From a3e51cc990812c8099dcaf1f3bd6d5bae45cf8e6 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Sat, 27 Dec 2014 13:25:18 -0800 Subject: [PATCH 001/116] [SPARK-4501][Core] - Create build/mvn to automatically download maven/zinc/scalac Creates a top level directory script (as `build/mvn`) to automatically download zinc and the specific version of scala used to easily build spark. This will also download and install maven if the user doesn't already have it and all packages are hosted under the `build/` directory. Tested on both Linux and OSX OS's and both work. All commands pass through to the maven binary so it acts exactly as a traditional maven call would. Author: Brennon York Closes #3707 from brennonyork/SPARK-4501 and squashes the following commits: 0e5a0e4 [Brennon York] minor incorrect doc verbage (with -> this) 9b79e38 [Brennon York] fixed merge conflicts with dev/run-tests, properly quoted args in sbt/sbt, fixed bug where relative paths would fail if passed in from build/mvn d2d41b6 [Brennon York] added blurb about leverging zinc with build/mvn b979c58 [Brennon York] updated the merge conflict c5634de [Brennon York] updated documentation to overview build/mvn, updated all points where sbt/sbt was referenced with build/sbt b8437ba [Brennon York] set progress bars for curl and wget when not run on jenkins, no progress bar when run on jenkins, moved sbt script to build/sbt, wrote stub and warning under sbt/sbt which calls build/sbt, modified build/sbt to use the correct directory, fixed bug in build/sbt-launch-lib.bash to correctly pull the sbt version be11317 [Brennon York] added switch to silence download progress only if AMPLAB_JENKINS is set 28d0a99 [Brennon York] updated to remove the python dependency, uses grep instead 7e785a6 [Brennon York] added silent and quiet flags to curl and wget respectively, added single echo output to denote start of a download if download is needed 14a5da0 [Brennon York] removed unnecessary zinc output on startup 1af4a94 [Brennon York] fixed bug with uppercase vs lowercase variable 3e8b9b3 [Brennon York] updated to properly only restart zinc if it was freshly installed a680d12 [Brennon York] Added comments to functions and tested various mvn calls bb8cc9d [Brennon York] removed package files ef017e6 [Brennon York] removed OS complexities, setup generic install_app call, removed extra file complexities, removed help, removed forced install (defaults now), removed double-dash from cli 07bf018 [Brennon York] Updated to specifically handle pulling down the correct scala version f914dea [Brennon York] Beginning final portions of localized scala home 69c4e44 [Brennon York] working linux and osx installers for purely local mvn build 4a1609c [Brennon York] finalizing working linux install for maven to local ./build/apache-maven folder cbfcc68 [Brennon York] Changed the default sbt/sbt to build/sbt and added a build/mvn which will automatically download, install, and execute maven with zinc for easier build capability --- .gitignore | 7 +- build/mvn | 132 +++++++++++++++++++++++ build/sbt | 111 +++++++++++++++++++ {sbt => build}/sbt-launch-lib.bash | 6 +- dev/create-release/create-release.sh | 10 +- dev/mima | 8 +- dev/run-tests | 24 ++--- dev/scalastyle | 4 +- docs/README.md | 6 +- docs/_plugins/copy_api_dirs.rb | 4 +- docs/building-spark.md | 45 +++++--- docs/hadoop-third-party-distributions.md | 10 +- extras/java8-tests/README.md | 6 +- python/pyspark/sql.py | 2 +- sbt/sbt | 117 ++------------------ sql/README.md | 4 +- 16 files changed, 330 
insertions(+), 166 deletions(-) create mode 100755 build/mvn create mode 100755 build/sbt rename {sbt => build}/sbt-launch-lib.bash (96%) diff --git a/.gitignore b/.gitignore index 20095dd97343e..9757054a50f9e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,16 +8,19 @@ *.pyc .idea/ .idea_modules/ -sbt/*.jar +build/*.jar .settings .cache +cache .generated-mima* -/build/ work/ out/ .DS_Store third_party/libmesos.so third_party/libmesos.dylib +build/apache-maven* +build/zinc* +build/scala* conf/java-opts conf/*.sh conf/*.cmd diff --git a/build/mvn b/build/mvn new file mode 100755 index 0000000000000..dde486a8ac605 --- /dev/null +++ b/build/mvn @@ -0,0 +1,132 @@ +#!/usr/bin/env bash + +# Determine the current working directory +_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +# Preserve the calling directory +_CALLING_DIR="$(pwd)" + +# Installs any application tarball given a URL, the expected tarball name, +# and, optionally, a checkable binary path to determine if the binary has +# already been installed +## Arg1 - URL +## Arg2 - Tarball Name +## Arg3 - Checkable Binary +install_app() { + local remote_tarball="$1/$2" + local local_tarball="${_DIR}/$2" + local binary="${_DIR}/$3" + + # setup `curl` and `wget` silent options if we're running on Jenkins + local curl_opts="" + local wget_opts="" + if [ -n "$AMPLAB_JENKINS" ]; then + curl_opts="-s" + wget_opts="--quiet" + else + curl_opts="--progress-bar" + wget_opts="--progress=bar:force" + fi + + if [ -z "$3" -o ! -f "$binary" ]; then + # check if we already have the tarball + # check if we have curl installed + # download application + [ ! -f "${local_tarball}" ] && [ -n "`which curl 2>/dev/null`" ] && \ + echo "exec: curl ${curl_opts} ${remote_tarball}" && \ + curl ${curl_opts} "${remote_tarball}" > "${local_tarball}" + # if the file still doesn't exist, lets try `wget` and cross our fingers + [ ! -f "${local_tarball}" ] && [ -n "`which wget 2>/dev/null`" ] && \ + echo "exec: wget ${wget_opts} ${remote_tarball}" && \ + wget ${wget_opts} -O "${local_tarball}" "${remote_tarball}" + # if both were unsuccessful, exit + [ ! -f "${local_tarball}" ] && \ + echo -n "ERROR: Cannot download $2 with cURL or wget; " && \ + echo "please install manually and try again." && \ + exit 2 + cd "${_DIR}" && tar -xzf "$2" + rm -rf "$local_tarball" + fi +} + +# Install maven under the build/ folder +install_mvn() { + install_app \ + "http://apache.claz.org/maven/maven-3/3.2.3/binaries" \ + "apache-maven-3.2.3-bin.tar.gz" \ + "apache-maven-3.2.3/bin/mvn" + MVN_BIN="${_DIR}/apache-maven-3.2.3/bin/mvn" +} + +# Install zinc under the build/ folder +install_zinc() { + local zinc_path="zinc-0.3.5.3/bin/zinc" + [ ! 
-f "${zinc_path}" ] && ZINC_INSTALL_FLAG=1 + install_app \ + "http://downloads.typesafe.com/zinc/0.3.5.3" \ + "zinc-0.3.5.3.tgz" \ + "${zinc_path}" + ZINC_BIN="${_DIR}/${zinc_path}" +} + +# Determine the Scala version from the root pom.xml file, set the Scala URL, +# and, with that, download the specific version of Scala necessary under +# the build/ folder +install_scala() { + # determine the Scala version used in Spark + local scala_version=`grep "scala.version" "${_DIR}/../pom.xml" | \ + head -1 | cut -f2 -d'>' | cut -f1 -d'<'` + local scala_bin="${_DIR}/scala-${scala_version}/bin/scala" + + install_app \ + "http://downloads.typesafe.com/scala/${scala_version}" \ + "scala-${scala_version}.tgz" \ + "scala-${scala_version}/bin/scala" + + SCALA_COMPILER="$(cd "$(dirname ${scala_bin})/../lib" && pwd)/scala-compiler.jar" + SCALA_LIBRARY="$(cd "$(dirname ${scala_bin})/../lib" && pwd)/scala-library.jar" +} + +# Determines if a given application is already installed. If not, will attempt +# to install +## Arg1 - application name +## Arg2 - Alternate path to local install under build/ dir +check_and_install_app() { + # create the local environment variable in uppercase + local app_bin="`echo $1 | awk '{print toupper(\$0)}'`_BIN" + # some black magic to set the generated app variable (i.e. MVN_BIN) into the + # environment + eval "${app_bin}=`which $1 2>/dev/null`" + + if [ -z "`which $1 2>/dev/null`" ]; then + install_$1 + fi +} + +# Setup healthy defaults for the Zinc port if none were provided from +# the environment +ZINC_PORT=${ZINC_PORT:-"3030"} + +# Check and install all applications necessary to build Spark +check_and_install_app "mvn" + +# Install the proper version of Scala and Zinc for the build +install_zinc +install_scala + +# Reset the current working directory +cd "${_CALLING_DIR}" + +# Now that zinc is ensured to be installed, check its status and, if its +# not running or just installed, start it +if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`${ZINC_BIN} -status`" ]; then + ${ZINC_BIN} -shutdown + ${ZINC_BIN} -start -port ${ZINC_PORT} \ + -scala-compiler "${SCALA_COMPILER}" \ + -scala-library "${SCALA_LIBRARY}" &>/dev/null +fi + +# Set any `mvn` options if not already present +export MAVEN_OPTS=${MAVEN_OPTS:-"-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"} + +# Last, call the `mvn` command as usual +${MVN_BIN} "$@" diff --git a/build/sbt b/build/sbt new file mode 100755 index 0000000000000..0a251d97db95c --- /dev/null +++ b/build/sbt @@ -0,0 +1,111 @@ +#!/usr/bin/env bash + +# When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so +# that we can run Hive to generate the golden answer. This is not required for normal development +# or testing. +for i in "$HIVE_HOME"/lib/* +do HADOOP_CLASSPATH="$HADOOP_CLASSPATH:$i" +done +export HADOOP_CLASSPATH + +realpath () { +( + TARGET_FILE="$1" + + cd "$(dirname "$TARGET_FILE")" + TARGET_FILE="$(basename "$TARGET_FILE")" + + COUNT=0 + while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] + do + TARGET_FILE="$(readlink "$TARGET_FILE")" + cd $(dirname "$TARGET_FILE") + TARGET_FILE="$(basename $TARGET_FILE)" + COUNT=$(($COUNT + 1)) + done + + echo "$(pwd -P)/"$TARGET_FILE"" +) +} + +. 
"$(dirname "$(realpath "$0")")"/sbt-launch-lib.bash + + +declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" +declare -r sbt_opts_file=".sbtopts" +declare -r etc_sbt_opts_file="/etc/sbt/sbtopts" + +usage() { + cat < path to global settings/plugins directory (default: ~/.sbt) + -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) + -ivy path to local Ivy repository (default: ~/.ivy2) + -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) + -no-share use all local caches; no sharing + -no-global uses global caches, but does not use global ~/.sbt directory. + -jvm-debug Turn on JVM debugging, open at the given port. + -batch Disable interactive mode + + # sbt version (default: from project/build.properties if present, else latest release) + -sbt-version use the specified version of sbt + -sbt-jar use the specified jar as the sbt launcher + -sbt-rc use an RC version of sbt + -sbt-snapshot use a snapshot version of sbt + + # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) + -java-home alternate JAVA_HOME + + # jvm options and output control + JAVA_OPTS environment variable, if unset uses "$java_opts" + SBT_OPTS environment variable, if unset uses "$default_sbt_opts" + .sbtopts if this file exists in the current directory, it is + prepended to the runner args + /etc/sbt/sbtopts if this file exists, it is prepended to the runner args + -Dkey=val pass -Dkey=val directly to the java runtime + -J-X pass option -X directly to the java runtime + (-J is stripped) + -S-X add -X to sbt's scalacOptions (-S is stripped) + -PmavenProfiles Enable a maven profile for the build. + +In the case of duplicated or conflicting options, the order above +shows precedence: JAVA_OPTS lowest, command line options highest. +EOM +} + +process_my_args () { + while [[ $# -gt 0 ]]; do + case "$1" in + -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; + -no-share) addJava "$noshare_opts" && shift ;; + -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; + -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; + -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; + -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; + -batch) exec &1 \ | grep -e "^java version" --max-count=1 \ | sed "s/java version \"\(.*\)\.\(.*\)\.\(.*\)\"/\1\2/" ) - + if [ "$JAVA_VERSION" -lt 18 ]; then echo "[warn] Java 8 tests will not run because JDK version is < 1.8." fi @@ -79,7 +79,7 @@ export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl" # Partial solution for SPARK-1455. if [ -n "$AMPLAB_JENKINS" ]; then git fetch origin master:master - + sql_diffs=$( git diff --name-only master \ | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh" @@ -93,7 +93,7 @@ if [ -n "$AMPLAB_JENKINS" ]; then if [ -n "$sql_diffs" ]; then echo "[info] Detected changes in SQL. Will run Hive test suite." _RUN_SQL_TESTS=true - + if [ -z "$non_sql_diffs" ]; then echo "[info] Detected no changes except in SQL. Will only run SQL tests." 
_SQL_TESTS_ONLY=true @@ -151,7 +151,7 @@ CURRENT_BLOCK=$BLOCK_BUILD HIVE_12_BUILD_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver -Phive-0.12.0" echo "[info] Compile with Hive 0.12.0" echo -e "q\n" \ - | sbt/sbt $HIVE_12_BUILD_ARGS clean hive/compile hive-thriftserver/compile \ + | build/sbt $HIVE_12_BUILD_ARGS clean hive/compile hive-thriftserver/compile \ | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" # Then build with default Hive version (0.13.1) because tests are based on this version @@ -160,7 +160,7 @@ CURRENT_BLOCK=$BLOCK_BUILD echo "[info] Building Spark with these arguments: $SBT_MAVEN_PROFILES_ARGS"\ " -Phive -Phive-thriftserver" echo -e "q\n" \ - | sbt/sbt $SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver package assembly/assembly \ + | build/sbt $SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver package assembly/assembly \ | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" } @@ -177,7 +177,7 @@ CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS if [ -n "$_RUN_SQL_TESTS" ]; then SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver" fi - + if [ -n "$_SQL_TESTS_ONLY" ]; then # This must be an array of individual arguments. Otherwise, having one long string # will be interpreted as a single test, which doesn't work. @@ -185,19 +185,19 @@ CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS else SBT_MAVEN_TEST_ARGS=("test") fi - + echo "[info] Running Spark tests with these arguments: $SBT_MAVEN_PROFILES_ARGS ${SBT_MAVEN_TEST_ARGS[@]}" - + # NOTE: echo "q" is needed because sbt on encountering a build file with failure # (either resolution or compilation) prompts the user for input either q, r, etc # to quit or retry. This echo is there to make it not block. - # NOTE: Do not quote $SBT_MAVEN_PROFILES_ARGS or else it will be interpreted as a + # NOTE: Do not quote $SBT_MAVEN_PROFILES_ARGS or else it will be interpreted as a # single argument! # "${SBT_MAVEN_TEST_ARGS[@]}" is cool because it's an array. # QUESTION: Why doesn't 'yes "q"' work? # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work? echo -e "q\n" \ - | sbt/sbt $SBT_MAVEN_PROFILES_ARGS "${SBT_MAVEN_TEST_ARGS[@]}" \ + | build/sbt $SBT_MAVEN_PROFILES_ARGS "${SBT_MAVEN_TEST_ARGS[@]}" \ | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" } diff --git a/dev/scalastyle b/dev/scalastyle index 3a4df6e4bf1bc..86919227ed1ab 100755 --- a/dev/scalastyle +++ b/dev/scalastyle @@ -17,9 +17,9 @@ # limitations under the License. # -echo -e "q\n" | sbt/sbt -Phive -Phive-thriftserver scalastyle > scalastyle.txt +echo -e "q\n" | build/sbt -Phive -Phive-thriftserver scalastyle > scalastyle.txt # Check style with YARN built too -echo -e "q\n" | sbt/sbt -Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0 scalastyle \ +echo -e "q\n" | build/sbt -Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0 scalastyle \ >> scalastyle.txt ERRORS=$(cat scalastyle.txt | awk '{if($1~/error/)print}') diff --git a/docs/README.md b/docs/README.md index 119484038083f..8a54724c4beae 100644 --- a/docs/README.md +++ b/docs/README.md @@ -21,7 +21,7 @@ read those text files directly if you want. Start with index.md. The markdown code can be compiled to HTML using the [Jekyll tool](http://jekyllrb.com). `Jekyll` and a few dependencies must be installed for this to work. We recommend -installing via the Ruby Gem dependency manager. Since the exact HTML output +installing via the Ruby Gem dependency manager. 
Since the exact HTML output varies between versions of Jekyll and its dependencies, we list specific versions here in some cases: @@ -60,7 +60,7 @@ We use Sphinx to generate Python API docs, so you will need to install it by run ## API Docs (Scaladoc and Sphinx) -You can build just the Spark scaladoc by running `sbt/sbt doc` from the SPARK_PROJECT_ROOT directory. +You can build just the Spark scaladoc by running `build/sbt doc` from the SPARK_PROJECT_ROOT directory. Similarly, you can build just the PySpark docs by running `make html` from the SPARK_PROJECT_ROOT/python/docs directory. Documentation is only generated for classes that are listed as @@ -68,7 +68,7 @@ public in `__init__.py`. When you run `jekyll` in the `docs` directory, it will also copy over the scaladoc for the various Spark subprojects into the `docs` directory (and then also into the `_site` directory). We use a -jekyll plugin to run `sbt/sbt doc` before building the site so if you haven't run it (recently) it +jekyll plugin to run `build/sbt doc` before building the site so if you haven't run it (recently) it may take some time as it generates all of the scaladoc. The jekyll plugin also generates the PySpark docs [Sphinx](http://sphinx-doc.org/). diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb index 4566a2fff562b..3c626a0b7f54b 100644 --- a/docs/_plugins/copy_api_dirs.rb +++ b/docs/_plugins/copy_api_dirs.rb @@ -25,8 +25,8 @@ curr_dir = pwd cd("..") - puts "Running 'sbt/sbt -Pkinesis-asl compile unidoc' from " + pwd + "; this may take a few minutes..." - puts `sbt/sbt -Pkinesis-asl compile unidoc` + puts "Running 'build/sbt -Pkinesis-asl compile unidoc' from " + pwd + "; this may take a few minutes..." + puts `build/sbt -Pkinesis-asl compile unidoc` puts "Moving back into docs dir." cd("docs") diff --git a/docs/building-spark.md b/docs/building-spark.md index dab3d2aef497e..c1bcd91b5b853 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -9,6 +9,15 @@ redirect_from: "building-with-maven.html" Building Spark using Maven requires Maven 3.0.4 or newer and Java 6+. +# Building with `build/mvn` + +Spark now comes packaged with a self-contained Maven installation to ease building and deployment of Spark from source located under the `build/` directory. This script will automatically download and setup all necessary build requirements ([Maven](https://maven.apache.org/), [Scala](http://www.scala-lang.org/), and [Zinc](https://github.com/typesafehub/zinc)) locally within the `build/` directory itself. It honors any `mvn` binary if present already, however, will pull down its own copy of Scala and Zinc regardless to ensure proper version requirements are met. `build/mvn` execution acts as a pass through to the `mvn` call allowing easy transition from previous build methods. As an example, one can build a version of Spark as follows: + +{% highlight bash %} +build/mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package +{% endhighlight %} + +Other build examples can be found below. # Setting up Maven's Memory Usage @@ -28,7 +37,9 @@ If you don't run this, you may see errors like the following: You can fix this by setting the `MAVEN_OPTS` variable as discussed before. 
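For reference, the memory settings can also be exported directly in the shell before invoking `mvn`; the value below simply mirrors the default that `build/mvn` applies when `MAVEN_OPTS` is unset, as shown in the script earlier in this patch:

{% highlight bash %}
# Same default that build/mvn exports when MAVEN_OPTS is not already set
export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"
{% endhighlight %}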
-**Note:** *For Java 8 and above this step is not required.* +**Note:** +* *For Java 8 and above this step is not required.* +* *If using `build/mvn` and `MAVEN_OPTS` were not already set, the script will automate this for you.* # Specifying the Hadoop Version @@ -84,7 +95,7 @@ mvn -Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0 -Dyarn.version=2.2.0 -DskipTests # Building With Hive and JDBC Support To enable Hive integration for Spark SQL along with its JDBC server and CLI, add the `-Phive` and `Phive-thriftserver` profiles to your existing build options. -By default Spark will build with Hive 0.13.1 bindings. You can also build for +By default Spark will build with Hive 0.13.1 bindings. You can also build for Hive 0.12.0 using the `-Phive-0.12.0` profile. {% highlight bash %} # Apache Hadoop 2.4.X with Hive 13 support @@ -106,7 +117,7 @@ supported in Scala 2.11 builds. # Spark Tests in Maven -Tests are run by default via the [ScalaTest Maven plugin](http://www.scalatest.org/user_guide/using_the_scalatest_maven_plugin). +Tests are run by default via the [ScalaTest Maven plugin](http://www.scalatest.org/user_guide/using_the_scalatest_maven_plugin). Some of the tests require Spark to be packaged first, so always run `mvn package` with `-DskipTests` the first time. The following is an example of a correct (build, test) sequence: @@ -124,7 +135,7 @@ We use the scala-maven-plugin which supports incremental and continuous compilat mvn scala:cc -should run continuous compilation (i.e. wait for changes). However, this has not been tested +should run continuous compilation (i.e. wait for changes). However, this has not been tested extensively. A couple of gotchas to note: * it only scans the paths `src/main` and `src/test` (see [docs](http://scala-tools.org/mvnsites/maven-scala-plugin/usage_cc.html)), so it will only work @@ -157,9 +168,9 @@ The debian package can then be found under assembly/target. We added the short c Running only Java 8 tests and nothing else. mvn install -DskipTests -Pjava8-tests - -Java 8 tests are run when `-Pjava8-tests` profile is enabled, they will run in spite of `-DskipTests`. -For these tests to run your system must have a JDK 8 installation. + +Java 8 tests are run when `-Pjava8-tests` profile is enabled, they will run in spite of `-DskipTests`. +For these tests to run your system must have a JDK 8 installation. If you have JDK 8 installed but it is not the system default, you can set JAVA_HOME to point to JDK 8 before running the tests. # Building for PySpark on YARN @@ -171,7 +182,7 @@ then ship it over to the cluster. We are investigating the exact cause for this. # Packaging without Hadoop Dependencies for YARN -The assembly jar produced by `mvn package` will, by default, include all of Spark's dependencies, including Hadoop and some of its ecosystem projects. On YARN deployments, this causes multiple versions of these to appear on executor classpaths: the version packaged in the Spark assembly and the version on each node, included with yarn.application.classpath. The `hadoop-provided` profile builds the assembly without including Hadoop-ecosystem projects, like ZooKeeper and Hadoop itself. +The assembly jar produced by `mvn package` will, by default, include all of Spark's dependencies, including Hadoop and some of its ecosystem projects. On YARN deployments, this causes multiple versions of these to appear on executor classpaths: the version packaged in the Spark assembly and the version on each node, included with yarn.application.classpath. 
The `hadoop-provided` profile builds the assembly without including Hadoop-ecosystem projects, like ZooKeeper and Hadoop itself. # Building with SBT @@ -182,22 +193,22 @@ compilation. More advanced developers may wish to use SBT. The SBT build is derived from the Maven POM files, and so the same Maven profiles and variables can be set to control the SBT build. For example: - sbt/sbt -Pyarn -Phadoop-2.3 assembly + build/sbt -Pyarn -Phadoop-2.3 assembly # Testing with SBT -Some of the tests require Spark to be packaged first, so always run `sbt/sbt assembly` the first time. The following is an example of a correct (build, test) sequence: +Some of the tests require Spark to be packaged first, so always run `build/sbt assembly` the first time. The following is an example of a correct (build, test) sequence: - sbt/sbt -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver assembly - sbt/sbt -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver test + build/sbt -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver assembly + build/sbt -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver test To run only a specific test suite as follows: - sbt/sbt -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver "test-only org.apache.spark.repl.ReplSuite" + build/sbt -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver "test-only org.apache.spark.repl.ReplSuite" To run test suites of a specific sub project as follows: - sbt/sbt -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver core/test + build/sbt -Pyarn -Phadoop-2.3 -Phive -Phive-thriftserver core/test # Speeding up Compilation with Zinc @@ -206,3 +217,9 @@ compiler. When run locally as a background process, it speeds up builds of Scala like Spark. Developers who regularly recompile Spark with Maven will be the most interested in Zinc. The project site gives instructions for building and running `zinc`; OS X users can install it using `brew install zinc`. + +If using the `build/mvn` package `zinc` will automatically be downloaded and leveraged for all +builds. This process will auto-start after the first time `build/mvn` is called and bind to port +3030 unless the `ZINC_PORT` environment variable is set. The `zinc` process can subsequently be +shut down at any time by running `build/zinc-/bin/zinc -shutdown` and will automatically +restart whenever `build/mvn` is called. diff --git a/docs/hadoop-third-party-distributions.md b/docs/hadoop-third-party-distributions.md index dd73e9dc54440..87dcc58feb494 100644 --- a/docs/hadoop-third-party-distributions.md +++ b/docs/hadoop-third-party-distributions.md @@ -18,7 +18,7 @@ see the guide on [building with maven](building-spark.html#specifying-the-hadoop The table below lists the corresponding `hadoop.version` code for each CDH/HDP release. Note that some Hadoop releases are binary compatible across client versions. This means the pre-built Spark -distribution may "just work" without you needing to compile. That said, we recommend compiling with +distribution may "just work" without you needing to compile. That said, we recommend compiling with the _exact_ Hadoop version you are running to avoid any compatibility errors. @@ -50,7 +50,7 @@ the _exact_ Hadoop version you are running to avoid any compatibility errors. In SBT, the equivalent can be achieved by setting the the `hadoop.version` property: - sbt/sbt -Dhadoop.version=1.0.4 assembly + build/sbt -Dhadoop.version=1.0.4 assembly # Linking Applications to the Hadoop Version @@ -98,11 +98,11 @@ Spark can run in a variety of deployment modes: * Using dedicated set of Spark nodes in your cluster. 
These nodes should be co-located with your Hadoop installation. -* Running on the same nodes as an existing Hadoop installation, with a fixed amount memory and +* Running on the same nodes as an existing Hadoop installation, with a fixed amount memory and cores dedicated to Spark on each node. * Run Spark alongside Hadoop using a cluster resource manager, such as YARN or Mesos. -These options are identical for those using CDH and HDP. +These options are identical for those using CDH and HDP. # Inheriting Cluster Configuration @@ -116,5 +116,5 @@ The location of these configuration files varies across CDH and HDP versions, bu a common location is inside of `/etc/hadoop/conf`. Some tools, such as Cloudera Manager, create configurations on-the-fly, but offer a mechanisms to download copies of them. -To make these files visible to Spark, set `HADOOP_CONF_DIR` in `$SPARK_HOME/spark-env.sh` +To make these files visible to Spark, set `HADOOP_CONF_DIR` in `$SPARK_HOME/spark-env.sh` to a location containing the configuration files. diff --git a/extras/java8-tests/README.md b/extras/java8-tests/README.md index e95b73ac7702a..dc9e87f2eeb92 100644 --- a/extras/java8-tests/README.md +++ b/extras/java8-tests/README.md @@ -8,7 +8,7 @@ to your Java location. The set-up depends a bit on the build system: `-java-home` to the sbt launch script. If a Java 8 JDK is detected sbt will automatically include the Java 8 test project. - `$ JAVA_HOME=/opt/jdk1.8.0/ sbt/sbt clean "test-only org.apache.spark.Java8APISuite"` + `$ JAVA_HOME=/opt/jdk1.8.0/ build/sbt clean "test-only org.apache.spark.Java8APISuite"` * For Maven users, @@ -19,6 +19,6 @@ to your Java location. The set-up depends a bit on the build system: `$ JAVA_HOME=/opt/jdk1.8.0/ mvn clean install -DskipTests` `$ JAVA_HOME=/opt/jdk1.8.0/ mvn test -Pjava8-tests -DwildcardSuites=org.apache.spark.Java8APISuite` - Note that the above command can only be run from project root directory since this module - depends on core and the test-jars of core and streaming. This means an install step is + Note that the above command can only be run from project root directory since this module + depends on core and the test-jars of core and streaming. This means an install step is required to make the test dependencies visible to the Java 8 sub-project. diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 9807a84a66f11..0e8b398fc6b97 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1671,7 +1671,7 @@ def _ssql_ctx(self): except Py4JError as e: raise Exception("You must build Spark with Hive. " "Export 'SPARK_HIVE=true' and run " - "sbt/sbt assembly", e) + "build/sbt assembly", e) def _get_hive_ctx(self): return self._jvm.HiveContext(self._jsc.sc()) diff --git a/sbt/sbt b/sbt/sbt index 0a251d97db95c..6f3e5e08ed27a 100755 --- a/sbt/sbt +++ b/sbt/sbt @@ -1,111 +1,12 @@ -#!/usr/bin/env bash +#!/bin/bash -# When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so -# that we can run Hive to generate the golden answer. This is not required for normal development -# or testing. -for i in "$HIVE_HOME"/lib/* -do HADOOP_CLASSPATH="$HADOOP_CLASSPATH:$i" -done -export HADOOP_CLASSPATH +# Determine the current working directory +_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -realpath () { -( - TARGET_FILE="$1" +echo "NOTE: The sbt/sbt script has been relocated to build/sbt." >&2 +echo " Please update references to point to the new location." >&2 +echo "" >&2 +echo " Invoking 'build/sbt $@' now ..." 
>&2 +echo "" >&2 - cd "$(dirname "$TARGET_FILE")" - TARGET_FILE="$(basename "$TARGET_FILE")" - - COUNT=0 - while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] - do - TARGET_FILE="$(readlink "$TARGET_FILE")" - cd $(dirname "$TARGET_FILE") - TARGET_FILE="$(basename $TARGET_FILE)" - COUNT=$(($COUNT + 1)) - done - - echo "$(pwd -P)/"$TARGET_FILE"" -) -} - -. "$(dirname "$(realpath "$0")")"/sbt-launch-lib.bash - - -declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" -declare -r sbt_opts_file=".sbtopts" -declare -r etc_sbt_opts_file="/etc/sbt/sbtopts" - -usage() { - cat < path to global settings/plugins directory (default: ~/.sbt) - -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) - -ivy path to local Ivy repository (default: ~/.ivy2) - -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) - -no-share use all local caches; no sharing - -no-global uses global caches, but does not use global ~/.sbt directory. - -jvm-debug Turn on JVM debugging, open at the given port. - -batch Disable interactive mode - - # sbt version (default: from project/build.properties if present, else latest release) - -sbt-version use the specified version of sbt - -sbt-jar use the specified jar as the sbt launcher - -sbt-rc use an RC version of sbt - -sbt-snapshot use a snapshot version of sbt - - # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) - -java-home alternate JAVA_HOME - - # jvm options and output control - JAVA_OPTS environment variable, if unset uses "$java_opts" - SBT_OPTS environment variable, if unset uses "$default_sbt_opts" - .sbtopts if this file exists in the current directory, it is - prepended to the runner args - /etc/sbt/sbtopts if this file exists, it is prepended to the runner args - -Dkey=val pass -Dkey=val directly to the java runtime - -J-X pass option -X directly to the java runtime - (-J is stripped) - -S-X add -X to sbt's scalacOptions (-S is stripped) - -PmavenProfiles Enable a maven profile for the build. - -In the case of duplicated or conflicting options, the order above -shows precedence: JAVA_OPTS lowest, command line options highest. 
-EOM -} - -process_my_args () { - while [[ $# -gt 0 ]]; do - case "$1" in - -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; - -no-share) addJava "$noshare_opts" && shift ;; - -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; - -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; - -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; - -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; - -batch) exec Date: Mon, 29 Dec 2014 08:20:30 -0600 Subject: [PATCH 002/116] [SPARK-4966][YARN]The MemoryOverhead value is setted not correctly Author: meiyoula <1039320815@qq.com> Closes #3797 from XuTingjun/MemoryOverhead and squashes the following commits: 5a780fc [meiyoula] Update ClientArguments.scala --- .../scala/org/apache/spark/deploy/yarn/ClientArguments.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala index 7305249f80e83..39f1021c9d942 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala @@ -39,6 +39,8 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) var appName: String = "Spark" var priority = 0 + parseArgs(args.toList) + // Additional memory to allocate to containers // For now, use driver's memory overhead as our AM container's memory overhead val amMemoryOverhead = sparkConf.getInt("spark.yarn.driver.memoryOverhead", @@ -50,7 +52,6 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf) private val isDynamicAllocationEnabled = sparkConf.getBoolean("spark.dynamicAllocation.enabled", false) - parseArgs(args.toList) loadEnvironmentArgs() validateArgs() From 6645e52580747990321e22340ae742f26d2f2504 Mon Sep 17 00:00:00 2001 From: wangxiaojing Date: Mon, 29 Dec 2014 10:45:14 -0800 Subject: [PATCH 003/116] [SPARK-4982][DOC] `spark.ui.retainedJobs` description is wrong in Spark UI configuration guide Author: wangxiaojing Closes #3818 from wangxiaojing/SPARK-4982 and squashes the following commits: fe2ad5f [wangxiaojing] change stages to jobs --- docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index 2cc013c47fdbb..fa9d311f85068 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -452,7 +452,7 @@ Apart from these, the following properties are also available, and may be useful From 4cef05e1c1d420af89164d6f4fabbad090542f1b Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Mon, 29 Dec 2014 10:48:53 -0800 Subject: [PATCH 004/116] Adde LICENSE Header to build/mvn, build/sbt and sbt/sbt Recently, build/mvn and build/sbt are added, and sbt/sbt is changed but there are no license headers. Should we add license headers to the scripts right? If it's not right, please let me correct. This PR doesn't affect behavior of Spark, I don't file in JIRA. 
Author: Kousuke Saruta Closes #3817 from sarutak/add-license-header and squashes the following commits: 1abc972 [Kousuke Saruta] Added LICENSE Header --- build/mvn | 17 +++++++++++++++++ build/sbt | 17 +++++++++++++++++ sbt/sbt | 19 ++++++++++++++++++- 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/build/mvn b/build/mvn index dde486a8ac605..43471f83e904c 100755 --- a/build/mvn +++ b/build/mvn @@ -1,5 +1,22 @@ #!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # Determine the current working directory _DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # Preserve the calling directory diff --git a/build/sbt b/build/sbt index 0a251d97db95c..28ebb64f7197c 100755 --- a/build/sbt +++ b/build/sbt @@ -1,5 +1,22 @@ #!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so # that we can run Hive to generate the golden answer. This is not required for normal development # or testing. diff --git a/sbt/sbt b/sbt/sbt index 6f3e5e08ed27a..41438251f681e 100755 --- a/sbt/sbt +++ b/sbt/sbt @@ -1,4 +1,21 @@ -#!/bin/bash +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# # Determine the current working directory _DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" From 815de54002f9c1cfedc398e95896fa207b4a5305 Mon Sep 17 00:00:00 2001 From: YanTangZhai Date: Mon, 29 Dec 2014 11:30:54 -0800 Subject: [PATCH 005/116] [SPARK-4946] [CORE] Using AkkaUtils.askWithReply in MapOutputTracker.askTracker to reduce the chance of the communicating problem Using AkkaUtils.askWithReply in MapOutputTracker.askTracker to reduce the chance of the communicating problem Author: YanTangZhai Author: yantangzhai Closes #3785 from YanTangZhai/SPARK-4946 and squashes the following commits: 9ca6541 [yantangzhai] [SPARK-4946] [CORE] Using AkkaUtils.askWithReply in MapOutputTracker.askTracker to reduce the chance of the communicating problem e4c2c0a [YanTangZhai] Merge pull request #15 from apache/master 718afeb [YanTangZhai] Merge pull request #12 from apache/master 6e643f8 [YanTangZhai] Merge pull request #11 from apache/master e249846 [YanTangZhai] Merge pull request #10 from apache/master d26d982 [YanTangZhai] Merge pull request #9 from apache/master 76d4027 [YanTangZhai] Merge pull request #8 from apache/master 03b62b0 [YanTangZhai] Merge pull request #7 from apache/master 8a00106 [YanTangZhai] Merge pull request #6 from apache/master cbcba66 [YanTangZhai] Merge pull request #3 from apache/master cdef539 [YanTangZhai] Merge pull request #1 from apache/master --- core/src/main/scala/org/apache/spark/MapOutputTracker.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index a074ab8ece1b7..6e4edc7c80d7a 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -76,6 +76,8 @@ private[spark] class MapOutputTrackerMasterActor(tracker: MapOutputTrackerMaster */ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging { private val timeout = AkkaUtils.askTimeout(conf) + private val retryAttempts = AkkaUtils.numRetries(conf) + private val retryIntervalMs = AkkaUtils.retryWaitMs(conf) /** Set to the MapOutputTrackerActor living on the driver. */ var trackerActor: ActorRef = _ @@ -108,8 +110,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging */ protected def askTracker(message: Any): Any = { try { - val future = trackerActor.ask(message)(timeout) - Await.result(future, timeout) + AkkaUtils.askWithReply(message, trackerActor, retryAttempts, retryIntervalMs, timeout) } catch { case e: Exception => logError("Error communicating with MapOutputTracker", e) From 8d72341ab75a7fb138b056cfb4e21db42aca55fb Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Mon, 29 Dec 2014 12:05:08 -0800 Subject: [PATCH 006/116] [Minor] Fix a typo of type parameter in JavaUtils.scala In JavaUtils.scala, thare is a typo of type parameter. In addition, the type information is removed at the time of compile by erasure. This issue is really minor so I don't file in JIRA. 
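The commit message refers to type erasure; a minimal, hypothetical Scala sketch (not taken from this patch) of the behavior it describes: whether the pattern is written with `a`, `A`, or any concrete type argument, only the `Map` class itself is tested at runtime, so the choice of type parameter cannot change behavior.

    import scala.collection.mutable

    // The type argument is erased: only the mutable.Map class is checked at runtime,
    // so this matches a mutable.Map[Int, Int] too (the compiler emits an
    // "unchecked" warning for the String argument).
    def looksLikeStringKeyedMap(x: Any): Boolean = x match {
      case _: mutable.Map[String, _] => true
      case _ => false
    }

    // looksLikeStringKeyedMap(mutable.Map(1 -> 2)) returns true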
Author: Kousuke Saruta Closes #3789 from sarutak/fix-typo-in-javautils and squashes the following commits: e20193d [Kousuke Saruta] Fixed a typo of type parameter 82bc5d9 [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into fix-typo-in-javautils 99f6f63 [Kousuke Saruta] Fixed a typo of type parameter in JavaUtils.scala --- core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala b/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala index 86e94931300f8..71b26737b8c02 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala @@ -80,7 +80,7 @@ private[spark] object JavaUtils { prev match { case Some(k) => underlying match { - case mm: mutable.Map[a, _] => + case mm: mutable.Map[A, _] => mm remove k prev = None case _ => From 02b55de3dce9a1fef806be13e5cefa0f39ea2fcc Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Mon, 29 Dec 2014 13:24:26 -0800 Subject: [PATCH 007/116] [SPARK-4409][MLlib] Additional Linear Algebra Utils Addition of a very limited number of local matrix manipulation and generation methods that would be helpful in the further development for algorithms on top of BlockMatrix (SPARK-3974), such as Randomized SVD, and Multi Model Training (SPARK-1486). The proposed methods for addition are: For `Matrix` - map: maps the values in the matrix with a given function. Produces a new matrix. - update: the values in the matrix are updated with a given function. Occurs in place. Factory methods for `DenseMatrix`: - *zeros: Generate a matrix consisting of zeros - *ones: Generate a matrix consisting of ones - *eye: Generate an identity matrix - *rand: Generate a matrix consisting of i.i.d. uniform random numbers - *randn: Generate a matrix consisting of i.i.d. gaussian random numbers - *diag: Generate a diagonal matrix from a supplied vector *These methods already exist in the factory methods for `Matrices`, however for cases where we require a `DenseMatrix`, you constantly have to add `.asInstanceOf[DenseMatrix]` everywhere, which makes the code "dirtier". I propose moving these functions to factory methods for `DenseMatrix` where the putput will be a `DenseMatrix` and the factory methods for `Matrices` will call these functions directly and output a generic `Matrix`. Factory methods for `SparseMatrix`: - speye: Identity matrix in sparse format. Saves a ton of memory when dimensions are large, especially in Multi Model Training, where each row requires being multiplied by a scalar. - sprand: Generate a sparse matrix with a given density consisting of i.i.d. uniform random numbers. - sprandn: Generate a sparse matrix with a given density consisting of i.i.d. gaussian random numbers. - diag: Generate a diagonal matrix from a supplied vector, but is memory efficient, because it just stores the diagonal. Again, very helpful in Multi Model Training. Factory methods for `Matrices`: - Include all the factory methods given above, but return a generic `Matrix` rather than `SparseMatrix` or `DenseMatrix`. - horzCat: Horizontally concatenate matrices to form one larger matrix. Very useful in both Multi Model Training, and for the repartitioning of BlockMatrix. - vertCat: Vertically concatenate matrices to form one larger matrix. Very useful for the repartitioning of BlockMatrix. 
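As a rough usage sketch of the factory methods listed above (signatures as they appear in the diff below; `Vectors.dense` is the existing vector factory in the same package and `java.util.Random` is already imported by the patched file):

    import java.util.Random
    import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}

    val rng = new Random(42)
    val dense = DenseMatrix.rand(3, 4, rng)         // 3x4 dense matrix of i.i.d. U(0, 1) values
    val identity = SparseMatrix.speye(5)            // 5x5 sparse identity, stores only 5 values
    val sparse = Matrices.sprand(4, 4, 0.25, rng)   // ~25% of the 16 entries are non-zero
    val diag = DenseMatrix.diag(Vectors.dense(1.0, 2.0, 3.0))  // 3x3 diagonal matrix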
The names for these methods were selected from MATLAB Author: Burak Yavuz Author: Xiangrui Meng Closes #3319 from brkyvz/SPARK-4409 and squashes the following commits: b0354f6 [Burak Yavuz] [SPARK-4409] Incorporated mengxr's code 04c4829 [Burak Yavuz] Merge pull request #1 from mengxr/SPARK-4409 80cfa29 [Xiangrui Meng] minor changes ecc937a [Xiangrui Meng] update sprand 4e95e24 [Xiangrui Meng] simplify fromCOO implementation 10a63a6 [Burak Yavuz] [SPARK-4409] Fourth pass of code review f62d6c7 [Burak Yavuz] [SPARK-4409] Modified genRandMatrix 3971c93 [Burak Yavuz] [SPARK-4409] Third pass of code review 75239f8 [Burak Yavuz] [SPARK-4409] Second pass of code review e4bd0c0 [Burak Yavuz] [SPARK-4409] Modified horzcat and vertcat 65c562e [Burak Yavuz] [SPARK-4409] Hopefully fixed Java Test d8be7bc [Burak Yavuz] [SPARK-4409] Organized imports 065b531 [Burak Yavuz] [SPARK-4409] First pass after code review a8120d2 [Burak Yavuz] [SPARK-4409] Finished updates to API according to SPARK-4614 f798c82 [Burak Yavuz] [SPARK-4409] Updated API according to SPARK-4614 c75f3cd [Burak Yavuz] [SPARK-4409] Added JavaAPI Tests, and fixed a couple of bugs d662f9d [Burak Yavuz] [SPARK-4409] Modified according to remote repo 83dfe37 [Burak Yavuz] [SPARK-4409] Scalastyle error fixed a14c0da [Burak Yavuz] [SPARK-4409] Initial commit to add methods --- .../apache/spark/mllib/linalg/Matrices.scala | 570 ++++++++++++++++-- .../spark/mllib/linalg/JavaMatricesSuite.java | 163 +++++ .../spark/mllib/linalg/MatricesSuite.scala | 172 +++++- .../spark/mllib/util/TestingUtils.scala | 6 +- 4 files changed, 868 insertions(+), 43 deletions(-) create mode 100644 mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 327366a1a3a82..5a7281ec6dc3c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -17,9 +17,11 @@ package org.apache.spark.mllib.linalg -import java.util.{Random, Arrays} +import java.util.{Arrays, Random} -import breeze.linalg.{Matrix => BM, DenseMatrix => BDM, CSCMatrix => BSM} +import scala.collection.mutable.{ArrayBuilder => MArrayBuilder, HashSet => MHashSet, ArrayBuffer} + +import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, Matrix => BM} /** * Trait for a local matrix. @@ -80,6 +82,16 @@ sealed trait Matrix extends Serializable { /** A human readable representation of the matrix */ override def toString: String = toBreeze.toString() + + /** Map the values of this matrix using a function. Generates a new matrix. Performs the + * function on only the backing array. For example, an operation such as addition or + * subtraction will only be performed on the non-zero values in a `SparseMatrix`. */ + private[mllib] def map(f: Double => Double): Matrix + + /** Update all the values of this matrix using the function f. Performed in-place on the + * backing array. For example, an operation such as addition or subtraction will only be + * performed on the non-zero values in a `SparseMatrix`. 
*/ + private[mllib] def update(f: Double => Double): Matrix } /** @@ -123,6 +135,122 @@ class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double]) } override def copy = new DenseMatrix(numRows, numCols, values.clone()) + + private[mllib] def map(f: Double => Double) = new DenseMatrix(numRows, numCols, values.map(f)) + + private[mllib] def update(f: Double => Double): DenseMatrix = { + val len = values.length + var i = 0 + while (i < len) { + values(i) = f(values(i)) + i += 1 + } + this + } + + /** Generate a `SparseMatrix` from the given `DenseMatrix`. */ + def toSparse(): SparseMatrix = { + val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble + val colPtrs: Array[Int] = new Array[Int](numCols + 1) + val rowIndices: MArrayBuilder[Int] = new MArrayBuilder.ofInt + var nnz = 0 + var j = 0 + while (j < numCols) { + var i = 0 + val indStart = j * numRows + while (i < numRows) { + val v = values(indStart + i) + if (v != 0.0) { + rowIndices += i + spVals += v + nnz += 1 + } + i += 1 + } + j += 1 + colPtrs(j) = nnz + } + new SparseMatrix(numRows, numCols, colPtrs, rowIndices.result(), spVals.result()) + } +} + +/** + * Factory methods for [[org.apache.spark.mllib.linalg.DenseMatrix]]. + */ +object DenseMatrix { + + /** + * Generate a `DenseMatrix` consisting of zeros. + * @param numRows number of rows of the matrix + * @param numCols number of columns of the matrix + * @return `DenseMatrix` with size `numRows` x `numCols` and values of zeros + */ + def zeros(numRows: Int, numCols: Int): DenseMatrix = + new DenseMatrix(numRows, numCols, new Array[Double](numRows * numCols)) + + /** + * Generate a `DenseMatrix` consisting of ones. + * @param numRows number of rows of the matrix + * @param numCols number of columns of the matrix + * @return `DenseMatrix` with size `numRows` x `numCols` and values of ones + */ + def ones(numRows: Int, numCols: Int): DenseMatrix = + new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(1.0)) + + /** + * Generate an Identity Matrix in `DenseMatrix` format. + * @param n number of rows and columns of the matrix + * @return `DenseMatrix` with size `n` x `n` and values of ones on the diagonal + */ + def eye(n: Int): DenseMatrix = { + val identity = DenseMatrix.zeros(n, n) + var i = 0 + while (i < n) { + identity.update(i, i, 1.0) + i += 1 + } + identity + } + + /** + * Generate a `DenseMatrix` consisting of i.i.d. uniform random numbers. + * @param numRows number of rows of the matrix + * @param numCols number of columns of the matrix + * @param rng a random number generator + * @return `DenseMatrix` with size `numRows` x `numCols` and values in U(0, 1) + */ + def rand(numRows: Int, numCols: Int, rng: Random): DenseMatrix = { + new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(rng.nextDouble())) + } + + /** + * Generate a `DenseMatrix` consisting of i.i.d. gaussian random numbers. + * @param numRows number of rows of the matrix + * @param numCols number of columns of the matrix + * @param rng a random number generator + * @return `DenseMatrix` with size `numRows` x `numCols` and values in N(0, 1) + */ + def randn(numRows: Int, numCols: Int, rng: Random): DenseMatrix = { + new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(rng.nextGaussian())) + } + + /** + * Generate a diagonal matrix in `DenseMatrix` format from the supplied values. 
+ * @param vector a `Vector` that will form the values on the diagonal of the matrix + * @return Square `DenseMatrix` with size `values.length` x `values.length` and `values` + * on the diagonal + */ + def diag(vector: Vector): DenseMatrix = { + val n = vector.size + val matrix = DenseMatrix.zeros(n, n) + val values = vector.toArray + var i = 0 + while (i < n) { + matrix.update(i, i, values(i)) + i += 1 + } + matrix + } } /** @@ -156,6 +284,8 @@ class SparseMatrix( require(colPtrs.length == numCols + 1, "The length of the column indices should be the " + s"number of columns + 1. Currently, colPointers.length: ${colPtrs.length}, " + s"numCols: $numCols") + require(values.length == colPtrs.last, "The last value of colPtrs must equal the number of " + + s"elements. values.length: ${values.length}, colPtrs.last: ${colPtrs.last}") override def toArray: Array[Double] = { val arr = new Array[Double](numRows * numCols) @@ -188,7 +318,7 @@ class SparseMatrix( private[mllib] def update(i: Int, j: Int, v: Double): Unit = { val ind = index(i, j) - if (ind == -1){ + if (ind == -1) { throw new NoSuchElementException("The given row and column indices correspond to a zero " + "value. Only non-zero elements in Sparse Matrices can be updated.") } else { @@ -197,6 +327,192 @@ class SparseMatrix( } override def copy = new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values.clone()) + + private[mllib] def map(f: Double => Double) = + new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values.map(f)) + + private[mllib] def update(f: Double => Double): SparseMatrix = { + val len = values.length + var i = 0 + while (i < len) { + values(i) = f(values(i)) + i += 1 + } + this + } + + /** Generate a `DenseMatrix` from the given `SparseMatrix`. */ + def toDense(): DenseMatrix = { + new DenseMatrix(numRows, numCols, toArray) + } +} + +/** + * Factory methods for [[org.apache.spark.mllib.linalg.SparseMatrix]]. + */ +object SparseMatrix { + + /** + * Generate a `SparseMatrix` from Coordinate List (COO) format. Input must be an array of + * (i, j, value) tuples. Entries that have duplicate values of i and j are + * added together. Tuples where value is equal to zero will be omitted. + * @param numRows number of rows of the matrix + * @param numCols number of columns of the matrix + * @param entries Array of (i, j, value) tuples + * @return The corresponding `SparseMatrix` + */ + def fromCOO(numRows: Int, numCols: Int, entries: Iterable[(Int, Int, Double)]): SparseMatrix = { + val sortedEntries = entries.toSeq.sortBy(v => (v._2, v._1)) + val numEntries = sortedEntries.size + if (sortedEntries.nonEmpty) { + // Since the entries are sorted by column index, we only need to check the first and the last. + for (col <- Seq(sortedEntries.head._2, sortedEntries.last._2)) { + require(col >= 0 && col < numCols, s"Column index out of range [0, $numCols): $col.") + } + } + val colPtrs = new Array[Int](numCols + 1) + val rowIndices = MArrayBuilder.make[Int] + rowIndices.sizeHint(numEntries) + val values = MArrayBuilder.make[Double] + values.sizeHint(numEntries) + var nnz = 0 + var prevCol = 0 + var prevRow = -1 + var prevVal = 0.0 + // Append a dummy entry to include the last one at the end of the loop. 
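+      // The dummy entry only forces the last accumulated (row, value) pair to be flushed
+      // and the trailing colPtrs slots to be filled with the final non-zero count; the
+      // dummy value itself is never emitted because the loop ends before another flush.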
+ (sortedEntries.view :+ (numRows, numCols, 1.0)).foreach { case (i, j, v) => + if (v != 0) { + if (i == prevRow && j == prevCol) { + prevVal += v + } else { + if (prevVal != 0) { + require(prevRow >= 0 && prevRow < numRows, + s"Row index out of range [0, $numRows): $prevRow.") + nnz += 1 + rowIndices += prevRow + values += prevVal + } + prevRow = i + prevVal = v + while (prevCol < j) { + colPtrs(prevCol + 1) = nnz + prevCol += 1 + } + } + } + } + new SparseMatrix(numRows, numCols, colPtrs, rowIndices.result(), values.result()) + } + + /** + * Generate an Identity Matrix in `SparseMatrix` format. + * @param n number of rows and columns of the matrix + * @return `SparseMatrix` with size `n` x `n` and values of ones on the diagonal + */ + def speye(n: Int): SparseMatrix = { + new SparseMatrix(n, n, (0 to n).toArray, (0 until n).toArray, Array.fill(n)(1.0)) + } + + /** + * Generates the skeleton of a random `SparseMatrix` with a given random number generator. + * The values of the matrix returned are undefined. + */ + private def genRandMatrix( + numRows: Int, + numCols: Int, + density: Double, + rng: Random): SparseMatrix = { + require(numRows > 0, s"numRows must be greater than 0 but got $numRows") + require(numCols > 0, s"numCols must be greater than 0 but got $numCols") + require(density >= 0.0 && density <= 1.0, + s"density must be a double in the range 0.0 <= d <= 1.0. Currently, density: $density") + val size = numRows.toLong * numCols + val expected = size * density + assert(expected < Int.MaxValue, + "The expected number of nonzeros cannot be greater than Int.MaxValue.") + val nnz = math.ceil(expected).toInt + if (density == 0.0) { + new SparseMatrix(numRows, numCols, new Array[Int](numCols + 1), Array[Int](), Array[Double]()) + } else if (density == 1.0) { + val colPtrs = Array.tabulate(numCols + 1)(j => j * numRows) + val rowIndices = Array.tabulate(size.toInt)(idx => idx % numRows) + new SparseMatrix(numRows, numCols, colPtrs, rowIndices, new Array[Double](numRows * numCols)) + } else if (density < 0.34) { + // draw-by-draw, expected number of iterations is less than 1.5 * nnz + val entries = MHashSet[(Int, Int)]() + while (entries.size < nnz) { + entries += ((rng.nextInt(numRows), rng.nextInt(numCols))) + } + SparseMatrix.fromCOO(numRows, numCols, entries.map(v => (v._1, v._2, 1.0))) + } else { + // selection-rejection method + var idx = 0L + var numSelected = 0 + var j = 0 + val colPtrs = new Array[Int](numCols + 1) + val rowIndices = new Array[Int](nnz) + while (j < numCols && numSelected < nnz) { + var i = 0 + while (i < numRows && numSelected < nnz) { + if (rng.nextDouble() < 1.0 * (nnz - numSelected) / (size - idx)) { + rowIndices(numSelected) = i + numSelected += 1 + } + i += 1 + idx += 1 + } + colPtrs(j + 1) = numSelected + j += 1 + } + new SparseMatrix(numRows, numCols, colPtrs, rowIndices, new Array[Double](nnz)) + } + } + + /** + * Generate a `SparseMatrix` consisting of i.i.d. uniform random numbers. 
The number of non-zero + * elements equal the ceiling of `numRows` x `numCols` x `density` + * + * @param numRows number of rows of the matrix + * @param numCols number of columns of the matrix + * @param density the desired density for the matrix + * @param rng a random number generator + * @return `SparseMatrix` with size `numRows` x `numCols` and values in U(0, 1) + */ + def sprand(numRows: Int, numCols: Int, density: Double, rng: Random): SparseMatrix = { + val mat = genRandMatrix(numRows, numCols, density, rng) + mat.update(i => rng.nextDouble()) + } + + /** + * Generate a `SparseMatrix` consisting of i.i.d. gaussian random numbers. + * @param numRows number of rows of the matrix + * @param numCols number of columns of the matrix + * @param density the desired density for the matrix + * @param rng a random number generator + * @return `SparseMatrix` with size `numRows` x `numCols` and values in N(0, 1) + */ + def sprandn(numRows: Int, numCols: Int, density: Double, rng: Random): SparseMatrix = { + val mat = genRandMatrix(numRows, numCols, density, rng) + mat.update(i => rng.nextGaussian()) + } + + /** + * Generate a diagonal matrix in `SparseMatrix` format from the supplied values. + * @param vector a `Vector` that will form the values on the diagonal of the matrix + * @return Square `SparseMatrix` with size `values.length` x `values.length` and non-zero + * `values` on the diagonal + */ + def diag(vector: Vector): SparseMatrix = { + val n = vector.size + vector match { + case sVec: SparseVector => + SparseMatrix.fromCOO(n, n, sVec.indices.zip(sVec.values).map(v => (v._1, v._1, v._2))) + case dVec: DenseVector => + val entries = dVec.values.zipWithIndex + val nnzVals = entries.filter(v => v._1 != 0.0) + SparseMatrix.fromCOO(n, n, nnzVals.map(v => (v._2, v._2, v._1))) + } + } } /** @@ -256,72 +572,250 @@ object Matrices { * Generate a `DenseMatrix` consisting of zeros. * @param numRows number of rows of the matrix * @param numCols number of columns of the matrix - * @return `DenseMatrix` with size `numRows` x `numCols` and values of zeros + * @return `Matrix` with size `numRows` x `numCols` and values of zeros */ - def zeros(numRows: Int, numCols: Int): Matrix = - new DenseMatrix(numRows, numCols, new Array[Double](numRows * numCols)) + def zeros(numRows: Int, numCols: Int): Matrix = DenseMatrix.zeros(numRows, numCols) /** * Generate a `DenseMatrix` consisting of ones. * @param numRows number of rows of the matrix * @param numCols number of columns of the matrix - * @return `DenseMatrix` with size `numRows` x `numCols` and values of ones + * @return `Matrix` with size `numRows` x `numCols` and values of ones */ - def ones(numRows: Int, numCols: Int): Matrix = - new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(1.0)) + def ones(numRows: Int, numCols: Int): Matrix = DenseMatrix.ones(numRows, numCols) /** - * Generate an Identity Matrix in `DenseMatrix` format. + * Generate a dense Identity Matrix in `Matrix` format. * @param n number of rows and columns of the matrix - * @return `DenseMatrix` with size `n` x `n` and values of ones on the diagonal + * @return `Matrix` with size `n` x `n` and values of ones on the diagonal */ - def eye(n: Int): Matrix = { - val identity = Matrices.zeros(n, n) - var i = 0 - while (i < n){ - identity.update(i, i, 1.0) - i += 1 - } - identity - } + def eye(n: Int): Matrix = DenseMatrix.eye(n) + + /** + * Generate a sparse Identity Matrix in `Matrix` format. 
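+   * Only the `n` diagonal entries are stored; for example, `speye(3)` is backed by
+   * `colPtrs = [0, 1, 2, 3]`, `rowIndices = [0, 1, 2]` and three values of 1.0.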
+ * @param n number of rows and columns of the matrix + * @return `Matrix` with size `n` x `n` and values of ones on the diagonal + */ + def speye(n: Int): Matrix = SparseMatrix.speye(n) /** * Generate a `DenseMatrix` consisting of i.i.d. uniform random numbers. * @param numRows number of rows of the matrix * @param numCols number of columns of the matrix * @param rng a random number generator - * @return `DenseMatrix` with size `numRows` x `numCols` and values in U(0, 1) + * @return `Matrix` with size `numRows` x `numCols` and values in U(0, 1) */ - def rand(numRows: Int, numCols: Int, rng: Random): Matrix = { - new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(rng.nextDouble())) - } + def rand(numRows: Int, numCols: Int, rng: Random): Matrix = + DenseMatrix.rand(numRows, numCols, rng) + + /** + * Generate a `SparseMatrix` consisting of i.i.d. gaussian random numbers. + * @param numRows number of rows of the matrix + * @param numCols number of columns of the matrix + * @param density the desired density for the matrix + * @param rng a random number generator + * @return `Matrix` with size `numRows` x `numCols` and values in U(0, 1) + */ + def sprand(numRows: Int, numCols: Int, density: Double, rng: Random): Matrix = + SparseMatrix.sprand(numRows, numCols, density, rng) /** * Generate a `DenseMatrix` consisting of i.i.d. gaussian random numbers. * @param numRows number of rows of the matrix * @param numCols number of columns of the matrix * @param rng a random number generator - * @return `DenseMatrix` with size `numRows` x `numCols` and values in N(0, 1) + * @return `Matrix` with size `numRows` x `numCols` and values in N(0, 1) */ - def randn(numRows: Int, numCols: Int, rng: Random): Matrix = { - new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(rng.nextGaussian())) - } + def randn(numRows: Int, numCols: Int, rng: Random): Matrix = + DenseMatrix.randn(numRows, numCols, rng) + + /** + * Generate a `SparseMatrix` consisting of i.i.d. gaussian random numbers. + * @param numRows number of rows of the matrix + * @param numCols number of columns of the matrix + * @param density the desired density for the matrix + * @param rng a random number generator + * @return `Matrix` with size `numRows` x `numCols` and values in N(0, 1) + */ + def sprandn(numRows: Int, numCols: Int, density: Double, rng: Random): Matrix = + SparseMatrix.sprandn(numRows, numCols, density, rng) /** * Generate a diagonal matrix in `DenseMatrix` format from the supplied values. * @param vector a `Vector` tat will form the values on the diagonal of the matrix - * @return Square `DenseMatrix` with size `values.length` x `values.length` and `values` + * @return Square `Matrix` with size `values.length` x `values.length` and `values` * on the diagonal */ - def diag(vector: Vector): Matrix = { - val n = vector.size - val matrix = Matrices.eye(n) - val values = vector.toArray - var i = 0 - while (i < n) { - matrix.update(i, i, values(i)) - i += 1 + def diag(vector: Vector): Matrix = DenseMatrix.diag(vector) + + /** + * Horizontally concatenate a sequence of matrices. The returned matrix will be in the format + * the matrices are supplied in. Supplying a mix of dense and sparse matrices will result in + * a sparse matrix. If the Array is empty, an empty `DenseMatrix` will be returned. 
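+   * For example (an illustrative sketch):
+   * {{{
+   *   val a = Matrices.eye(2)               // dense 2 x 2 identity
+   *   val b = Matrices.speye(2)             // sparse 2 x 2 identity
+   *   val c = Matrices.horzcat(Array(a, b)) // 2 x 4 result, sparse because `b` is sparse
+   * }}}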
+ * @param matrices array of matrices + * @return a single `Matrix` composed of the matrices that were horizontally concatenated + */ + def horzcat(matrices: Array[Matrix]): Matrix = { + if (matrices.isEmpty) { + return new DenseMatrix(0, 0, Array[Double]()) + } else if (matrices.size == 1) { + return matrices(0) + } + val numRows = matrices(0).numRows + var hasSparse = false + var numCols = 0 + matrices.foreach { mat => + require(numRows == mat.numRows, "The number of rows of the matrices in this sequence, " + + "don't match!") + mat match { + case sparse: SparseMatrix => hasSparse = true + case dense: DenseMatrix => // empty on purpose + case _ => throw new IllegalArgumentException("Unsupported matrix format. Expected " + + s"SparseMatrix or DenseMatrix. Instead got: ${mat.getClass}") + } + numCols += mat.numCols + } + if (!hasSparse) { + new DenseMatrix(numRows, numCols, matrices.flatMap(_.toArray)) + } else { + var startCol = 0 + val entries: Array[(Int, Int, Double)] = matrices.flatMap { + case spMat: SparseMatrix => + var j = 0 + val colPtrs = spMat.colPtrs + val rowIndices = spMat.rowIndices + val values = spMat.values + val data = new Array[(Int, Int, Double)](values.length) + val nCols = spMat.numCols + while (j < nCols) { + var idx = colPtrs(j) + while (idx < colPtrs(j + 1)) { + val i = rowIndices(idx) + val v = values(idx) + data(idx) = (i, j + startCol, v) + idx += 1 + } + j += 1 + } + startCol += nCols + data + case dnMat: DenseMatrix => + val data = new ArrayBuffer[(Int, Int, Double)]() + var j = 0 + val nCols = dnMat.numCols + val nRows = dnMat.numRows + val values = dnMat.values + while (j < nCols) { + var i = 0 + val indStart = j * nRows + while (i < nRows) { + val v = values(indStart + i) + if (v != 0.0) { + data.append((i, j + startCol, v)) + } + i += 1 + } + j += 1 + } + startCol += nCols + data + } + SparseMatrix.fromCOO(numRows, numCols, entries) + } + } + + /** + * Vertically concatenate a sequence of matrices. The returned matrix will be in the format + * the matrices are supplied in. Supplying a mix of dense and sparse matrices will result in + * a sparse matrix. If the Array is empty, an empty `DenseMatrix` will be returned. + * @param matrices array of matrices + * @return a single `Matrix` composed of the matrices that were vertically concatenated + */ + def vertcat(matrices: Array[Matrix]): Matrix = { + if (matrices.isEmpty) { + return new DenseMatrix(0, 0, Array[Double]()) + } else if (matrices.size == 1) { + return matrices(0) + } + val numCols = matrices(0).numCols + var hasSparse = false + var numRows = 0 + matrices.foreach { mat => + require(numCols == mat.numCols, "The number of rows of the matrices in this sequence, " + + "don't match!") + mat match { + case sparse: SparseMatrix => + hasSparse = true + case dense: DenseMatrix => + case _ => throw new IllegalArgumentException("Unsupported matrix format. Expected " + + s"SparseMatrix or DenseMatrix. 
Instead got: ${mat.getClass}") + } + numRows += mat.numRows + + } + if (!hasSparse) { + val allValues = new Array[Double](numRows * numCols) + var startRow = 0 + matrices.foreach { mat => + var j = 0 + val nRows = mat.numRows + val values = mat.toArray + while (j < numCols) { + var i = 0 + val indStart = j * numRows + startRow + val subMatStart = j * nRows + while (i < nRows) { + allValues(indStart + i) = values(subMatStart + i) + i += 1 + } + j += 1 + } + startRow += nRows + } + new DenseMatrix(numRows, numCols, allValues) + } else { + var startRow = 0 + val entries: Array[(Int, Int, Double)] = matrices.flatMap { + case spMat: SparseMatrix => + var j = 0 + val colPtrs = spMat.colPtrs + val rowIndices = spMat.rowIndices + val values = spMat.values + val data = new Array[(Int, Int, Double)](values.length) + while (j < numCols) { + var idx = colPtrs(j) + while (idx < colPtrs(j + 1)) { + val i = rowIndices(idx) + val v = values(idx) + data(idx) = (i + startRow, j, v) + idx += 1 + } + j += 1 + } + startRow += spMat.numRows + data + case dnMat: DenseMatrix => + val data = new ArrayBuffer[(Int, Int, Double)]() + var j = 0 + val nCols = dnMat.numCols + val nRows = dnMat.numRows + val values = dnMat.values + while (j < nCols) { + var i = 0 + val indStart = j * nRows + while (i < nRows) { + val v = values(indStart + i) + if (v != 0.0) { + data.append((i + startRow, j, v)) + } + i += 1 + } + j += 1 + } + startRow += nRows + data + } + SparseMatrix.fromCOO(numRows, numCols, entries) } - matrix } } diff --git a/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java new file mode 100644 index 0000000000000..704d484d0b585 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.linalg; + +import static org.junit.Assert.*; +import org.junit.Test; + +import java.io.Serializable; +import java.util.Random; + +public class JavaMatricesSuite implements Serializable { + + @Test + public void randMatrixConstruction() { + Random rng = new Random(24); + Matrix r = Matrices.rand(3, 4, rng); + rng.setSeed(24); + DenseMatrix dr = DenseMatrix.rand(3, 4, rng); + assertArrayEquals(r.toArray(), dr.toArray(), 0.0); + + rng.setSeed(24); + Matrix rn = Matrices.randn(3, 4, rng); + rng.setSeed(24); + DenseMatrix drn = DenseMatrix.randn(3, 4, rng); + assertArrayEquals(rn.toArray(), drn.toArray(), 0.0); + + rng.setSeed(24); + Matrix s = Matrices.sprand(3, 4, 0.5, rng); + rng.setSeed(24); + SparseMatrix sr = SparseMatrix.sprand(3, 4, 0.5, rng); + assertArrayEquals(s.toArray(), sr.toArray(), 0.0); + + rng.setSeed(24); + Matrix sn = Matrices.sprandn(3, 4, 0.5, rng); + rng.setSeed(24); + SparseMatrix srn = SparseMatrix.sprandn(3, 4, 0.5, rng); + assertArrayEquals(sn.toArray(), srn.toArray(), 0.0); + } + + @Test + public void identityMatrixConstruction() { + Matrix r = Matrices.eye(2); + DenseMatrix dr = DenseMatrix.eye(2); + SparseMatrix sr = SparseMatrix.speye(2); + assertArrayEquals(r.toArray(), dr.toArray(), 0.0); + assertArrayEquals(sr.toArray(), dr.toArray(), 0.0); + assertArrayEquals(r.toArray(), new double[]{1.0, 0.0, 0.0, 1.0}, 0.0); + } + + @Test + public void diagonalMatrixConstruction() { + Vector v = Vectors.dense(1.0, 0.0, 2.0); + Vector sv = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 2.0}); + + Matrix m = Matrices.diag(v); + Matrix sm = Matrices.diag(sv); + DenseMatrix d = DenseMatrix.diag(v); + DenseMatrix sd = DenseMatrix.diag(sv); + SparseMatrix s = SparseMatrix.diag(v); + SparseMatrix ss = SparseMatrix.diag(sv); + + assertArrayEquals(m.toArray(), sm.toArray(), 0.0); + assertArrayEquals(d.toArray(), sm.toArray(), 0.0); + assertArrayEquals(d.toArray(), sd.toArray(), 0.0); + assertArrayEquals(sd.toArray(), s.toArray(), 0.0); + assertArrayEquals(s.toArray(), ss.toArray(), 0.0); + assertArrayEquals(s.values(), ss.values(), 0.0); + assert(s.values().length == 2); + assert(ss.values().length == 2); + assert(s.colPtrs().length == 4); + assert(ss.colPtrs().length == 4); + } + + @Test + public void zerosMatrixConstruction() { + Matrix z = Matrices.zeros(2, 2); + Matrix one = Matrices.ones(2, 2); + DenseMatrix dz = DenseMatrix.zeros(2, 2); + DenseMatrix done = DenseMatrix.ones(2, 2); + + assertArrayEquals(z.toArray(), new double[]{0.0, 0.0, 0.0, 0.0}, 0.0); + assertArrayEquals(dz.toArray(), new double[]{0.0, 0.0, 0.0, 0.0}, 0.0); + assertArrayEquals(one.toArray(), new double[]{1.0, 1.0, 1.0, 1.0}, 0.0); + assertArrayEquals(done.toArray(), new double[]{1.0, 1.0, 1.0, 1.0}, 0.0); + } + + @Test + public void sparseDenseConversion() { + int m = 3; + int n = 2; + double[] values = new double[]{1.0, 2.0, 4.0, 5.0}; + double[] allValues = new double[]{1.0, 2.0, 0.0, 0.0, 4.0, 5.0}; + int[] colPtrs = new int[]{0, 2, 4}; + int[] rowIndices = new int[]{0, 1, 1, 2}; + + SparseMatrix spMat1 = new SparseMatrix(m, n, colPtrs, rowIndices, values); + DenseMatrix deMat1 = new DenseMatrix(m, n, allValues); + + SparseMatrix spMat2 = deMat1.toSparse(); + DenseMatrix deMat2 = spMat1.toDense(); + + assertArrayEquals(spMat1.toArray(), spMat2.toArray(), 0.0); + assertArrayEquals(deMat1.toArray(), deMat2.toArray(), 0.0); + } + + @Test + public void concatenateMatrices() { + int m = 3; + int n = 2; + + Random rng = new Random(42); + SparseMatrix spMat1 = 
SparseMatrix.sprand(m, n, 0.5, rng); + rng.setSeed(42); + DenseMatrix deMat1 = DenseMatrix.rand(m, n, rng); + Matrix deMat2 = Matrices.eye(3); + Matrix spMat2 = Matrices.speye(3); + Matrix deMat3 = Matrices.eye(2); + Matrix spMat3 = Matrices.speye(2); + + Matrix spHorz = Matrices.horzcat(new Matrix[]{spMat1, spMat2}); + Matrix deHorz1 = Matrices.horzcat(new Matrix[]{deMat1, deMat2}); + Matrix deHorz2 = Matrices.horzcat(new Matrix[]{spMat1, deMat2}); + Matrix deHorz3 = Matrices.horzcat(new Matrix[]{deMat1, spMat2}); + + assert(deHorz1.numRows() == 3); + assert(deHorz2.numRows() == 3); + assert(deHorz3.numRows() == 3); + assert(spHorz.numRows() == 3); + assert(deHorz1.numCols() == 5); + assert(deHorz2.numCols() == 5); + assert(deHorz3.numCols() == 5); + assert(spHorz.numCols() == 5); + + Matrix spVert = Matrices.vertcat(new Matrix[]{spMat1, spMat3}); + Matrix deVert1 = Matrices.vertcat(new Matrix[]{deMat1, deMat3}); + Matrix deVert2 = Matrices.vertcat(new Matrix[]{spMat1, deMat3}); + Matrix deVert3 = Matrices.vertcat(new Matrix[]{deMat1, spMat3}); + + assert(deVert1.numRows() == 5); + assert(deVert2.numRows() == 5); + assert(deVert3.numRows() == 5); + assert(spVert.numRows() == 5); + assert(deVert1.numCols() == 2); + assert(deVert2.numCols() == 2); + assert(deVert3.numCols() == 2); + assert(spVert.numCols() == 2); + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index 322a0e9242918..a35d0fe389fdd 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -43,9 +43,9 @@ class MatricesSuite extends FunSuite { test("sparse matrix construction") { val m = 3 - val n = 2 + val n = 4 val values = Array(1.0, 2.0, 4.0, 5.0) - val colPtrs = Array(0, 2, 4) + val colPtrs = Array(0, 2, 2, 4, 4) val rowIndices = Array(1, 2, 1, 2) val mat = Matrices.sparse(m, n, colPtrs, rowIndices, values).asInstanceOf[SparseMatrix] assert(mat.numRows === m) @@ -53,6 +53,13 @@ class MatricesSuite extends FunSuite { assert(mat.values.eq(values), "should not copy data") assert(mat.colPtrs.eq(colPtrs), "should not copy data") assert(mat.rowIndices.eq(rowIndices), "should not copy data") + + val entries: Array[(Int, Int, Double)] = Array((2, 2, 3.0), (1, 0, 1.0), (2, 0, 2.0), + (1, 2, 2.0), (2, 2, 2.0), (1, 2, 2.0), (0, 0, 0.0)) + + val mat2 = SparseMatrix.fromCOO(m, n, entries) + assert(mat.toBreeze === mat2.toBreeze) + assert(mat2.values.length == 4) } test("sparse matrix construction with wrong number of elements") { @@ -117,6 +124,142 @@ class MatricesSuite extends FunSuite { assert(sparseMat.values(2) === 10.0) } + test("toSparse, toDense") { + val m = 3 + val n = 2 + val values = Array(1.0, 2.0, 4.0, 5.0) + val allValues = Array(1.0, 2.0, 0.0, 0.0, 4.0, 5.0) + val colPtrs = Array(0, 2, 4) + val rowIndices = Array(0, 1, 1, 2) + + val spMat1 = new SparseMatrix(m, n, colPtrs, rowIndices, values) + val deMat1 = new DenseMatrix(m, n, allValues) + + val spMat2 = deMat1.toSparse() + val deMat2 = spMat1.toDense() + + assert(spMat1.toBreeze === spMat2.toBreeze) + assert(deMat1.toBreeze === deMat2.toBreeze) + } + + test("map, update") { + val m = 3 + val n = 2 + val values = Array(1.0, 2.0, 4.0, 5.0) + val allValues = Array(1.0, 2.0, 0.0, 0.0, 4.0, 5.0) + val colPtrs = Array(0, 2, 4) + val rowIndices = Array(0, 1, 1, 2) + + val spMat1 = new SparseMatrix(m, n, colPtrs, rowIndices, values) + val deMat1 = new 
DenseMatrix(m, n, allValues) + val deMat2 = deMat1.map(_ * 2) + val spMat2 = spMat1.map(_ * 2) + deMat1.update(_ * 2) + spMat1.update(_ * 2) + + assert(spMat1.toArray === spMat2.toArray) + assert(deMat1.toArray === deMat2.toArray) + } + + test("horzcat, vertcat, eye, speye") { + val m = 3 + val n = 2 + val values = Array(1.0, 2.0, 4.0, 5.0) + val allValues = Array(1.0, 2.0, 0.0, 0.0, 4.0, 5.0) + val colPtrs = Array(0, 2, 4) + val rowIndices = Array(0, 1, 1, 2) + + val spMat1 = new SparseMatrix(m, n, colPtrs, rowIndices, values) + val deMat1 = new DenseMatrix(m, n, allValues) + val deMat2 = Matrices.eye(3) + val spMat2 = Matrices.speye(3) + val deMat3 = Matrices.eye(2) + val spMat3 = Matrices.speye(2) + + val spHorz = Matrices.horzcat(Array(spMat1, spMat2)) + val spHorz2 = Matrices.horzcat(Array(spMat1, deMat2)) + val spHorz3 = Matrices.horzcat(Array(deMat1, spMat2)) + val deHorz1 = Matrices.horzcat(Array(deMat1, deMat2)) + + val deHorz2 = Matrices.horzcat(Array[Matrix]()) + + assert(deHorz1.numRows === 3) + assert(spHorz2.numRows === 3) + assert(spHorz3.numRows === 3) + assert(spHorz.numRows === 3) + assert(deHorz1.numCols === 5) + assert(spHorz2.numCols === 5) + assert(spHorz3.numCols === 5) + assert(spHorz.numCols === 5) + assert(deHorz2.numRows === 0) + assert(deHorz2.numCols === 0) + assert(deHorz2.toArray.length === 0) + + assert(deHorz1.toBreeze.toDenseMatrix === spHorz2.toBreeze.toDenseMatrix) + assert(spHorz2.toBreeze === spHorz3.toBreeze) + assert(spHorz(0, 0) === 1.0) + assert(spHorz(2, 1) === 5.0) + assert(spHorz(0, 2) === 1.0) + assert(spHorz(1, 2) === 0.0) + assert(spHorz(1, 3) === 1.0) + assert(spHorz(2, 4) === 1.0) + assert(spHorz(1, 4) === 0.0) + assert(deHorz1(0, 0) === 1.0) + assert(deHorz1(2, 1) === 5.0) + assert(deHorz1(0, 2) === 1.0) + assert(deHorz1(1, 2) == 0.0) + assert(deHorz1(1, 3) === 1.0) + assert(deHorz1(2, 4) === 1.0) + assert(deHorz1(1, 4) === 0.0) + + intercept[IllegalArgumentException] { + Matrices.horzcat(Array(spMat1, spMat3)) + } + + intercept[IllegalArgumentException] { + Matrices.horzcat(Array(deMat1, spMat3)) + } + + val spVert = Matrices.vertcat(Array(spMat1, spMat3)) + val deVert1 = Matrices.vertcat(Array(deMat1, deMat3)) + val spVert2 = Matrices.vertcat(Array(spMat1, deMat3)) + val spVert3 = Matrices.vertcat(Array(deMat1, spMat3)) + val deVert2 = Matrices.vertcat(Array[Matrix]()) + + assert(deVert1.numRows === 5) + assert(spVert2.numRows === 5) + assert(spVert3.numRows === 5) + assert(spVert.numRows === 5) + assert(deVert1.numCols === 2) + assert(spVert2.numCols === 2) + assert(spVert3.numCols === 2) + assert(spVert.numCols === 2) + assert(deVert2.numRows === 0) + assert(deVert2.numCols === 0) + assert(deVert2.toArray.length === 0) + + assert(deVert1.toBreeze.toDenseMatrix === spVert2.toBreeze.toDenseMatrix) + assert(spVert2.toBreeze === spVert3.toBreeze) + assert(spVert(0, 0) === 1.0) + assert(spVert(2, 1) === 5.0) + assert(spVert(3, 0) === 1.0) + assert(spVert(3, 1) === 0.0) + assert(spVert(4, 1) === 1.0) + assert(deVert1(0, 0) === 1.0) + assert(deVert1(2, 1) === 5.0) + assert(deVert1(3, 0) === 1.0) + assert(deVert1(3, 1) === 0.0) + assert(deVert1(4, 1) === 1.0) + + intercept[IllegalArgumentException] { + Matrices.vertcat(Array(spMat1, spMat2)) + } + + intercept[IllegalArgumentException] { + Matrices.vertcat(Array(deMat1, spMat2)) + } + } + test("zeros") { val mat = Matrices.zeros(2, 3).asInstanceOf[DenseMatrix] assert(mat.numRows === 2) @@ -162,4 +305,29 @@ class MatricesSuite extends FunSuite { assert(mat.numCols === 2) assert(mat.values.toSeq 
=== Seq(1.0, 0.0, 0.0, 2.0)) } + + test("sprand") { + val rng = mock[Random] + when(rng.nextInt(4)).thenReturn(0, 1, 1, 3, 2, 2, 0, 1, 3, 0) + when(rng.nextDouble()).thenReturn(1.0, 2.0, 3.0, 4.0, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0) + val mat = SparseMatrix.sprand(4, 4, 0.25, rng) + assert(mat.numRows === 4) + assert(mat.numCols === 4) + assert(mat.rowIndices.toSeq === Seq(3, 0, 2, 1)) + assert(mat.values.toSeq === Seq(1.0, 2.0, 3.0, 4.0)) + val mat2 = SparseMatrix.sprand(2, 3, 1.0, rng) + assert(mat2.rowIndices.toSeq === Seq(0, 1, 0, 1, 0, 1)) + assert(mat2.colPtrs.toSeq === Seq(0, 2, 4, 6)) + } + + test("sprandn") { + val rng = mock[Random] + when(rng.nextInt(4)).thenReturn(0, 1, 1, 3, 2, 2, 0, 1, 3, 0) + when(rng.nextGaussian()).thenReturn(1.0, 2.0, 3.0, 4.0) + val mat = SparseMatrix.sprandn(4, 4, 0.25, rng) + assert(mat.numRows === 4) + assert(mat.numCols === 4) + assert(mat.rowIndices.toSeq === Seq(3, 0, 2, 1)) + assert(mat.values.toSeq === Seq(1.0, 2.0, 3.0, 4.0)) + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtils.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtils.scala index 30b906aaa3ba4..e957fa5d25f4c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtils.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtils.scala @@ -178,17 +178,17 @@ object TestingUtils { implicit class MatrixWithAlmostEquals(val x: Matrix) { /** - * When the difference of two vectors are within eps, returns true; otherwise, returns false. + * When the difference of two matrices are within eps, returns true; otherwise, returns false. */ def ~=(r: CompareMatrixRightSide): Boolean = r.fun(x, r.y, r.eps) /** - * When the difference of two vectors are within eps, returns false; otherwise, returns true. + * When the difference of two matrices are within eps, returns false; otherwise, returns true. */ def !~=(r: CompareMatrixRightSide): Boolean = !r.fun(x, r.y, r.eps) /** - * Throws exception when the difference of two vectors are NOT within eps; + * Throws exception when the difference of two matrices are NOT within eps; * otherwise, returns true. */ def ~==(r: CompareMatrixRightSide): Boolean = { From 9bc0df6804f241aff24520d9c6ec54d9b11f5785 Mon Sep 17 00:00:00 2001 From: Yash Datta Date: Mon, 29 Dec 2014 13:49:45 -0800 Subject: [PATCH 008/116] SPARK-4968: takeOrdered to skip reduce step in case mappers return no partitions takeOrdered should skip reduce step in case mapped RDDs have no partitions. This prevents the mentioned exception : 4. 
run query SELECT * FROM testTable WHERE market = 'market2' ORDER BY End_Time DESC LIMIT 100; Error trace java.lang.UnsupportedOperationException: empty collection at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:863) at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:863) at scala.Option.getOrElse(Option.scala:120) at org.apache.spark.rdd.RDD.reduce(RDD.scala:863) at org.apache.spark.rdd.RDD.takeOrdered(RDD.scala:1136) Author: Yash Datta Closes #3830 from saucam/fix_takeorder and squashes the following commits: 5974d10 [Yash Datta] SPARK-4968: takeOrdered to skip reduce step in case mappers return no partitions --- .../src/main/scala/org/apache/spark/rdd/RDD.scala | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index f47c2d1fcdcc7..5118e2b911120 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1146,15 +1146,20 @@ abstract class RDD[T: ClassTag]( if (num == 0) { Array.empty } else { - mapPartitions { items => + val mapRDDs = mapPartitions { items => // Priority keeps the largest elements, so let's reverse the ordering. val queue = new BoundedPriorityQueue[T](num)(ord.reverse) queue ++= util.collection.Utils.takeOrdered(items, num)(ord) Iterator.single(queue) - }.reduce { (queue1, queue2) => - queue1 ++= queue2 - queue1 - }.toArray.sorted(ord) + } + if (mapRDDs.partitions.size == 0) { + Array.empty + } else { + mapRDDs.reduce { (queue1, queue2) => + queue1 ++= queue2 + queue1 + }.toArray.sorted(ord) + } } } From 6cf6fdf3ff5d1cf33c2dc28f039adc4d7c0f0464 Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Mon, 29 Dec 2014 15:29:15 -0800 Subject: [PATCH 009/116] SPARK-4156 [MLLIB] EM algorithm for GMMs Implementation of Expectation-Maximization for Gaussian Mixture Models. This is my maiden contribution to Apache Spark, so I apologize now if I have done anything incorrectly; having said that, this work is my own, and I offer it to the project under the project's open source license. Author: Travis Galoppo Author: Travis Galoppo Author: tgaloppo Author: FlytxtRnD Closes #3022 from tgaloppo/master and squashes the following commits: aaa8f25 [Travis Galoppo] MLUtils: changed privacy of EPSILON from [util] to [mllib] 709e4bf [Travis Galoppo] fixed usage line to include optional maxIterations parameter acf1fba [Travis Galoppo] Fixed parameter comment in GaussianMixtureModel Made maximum iterations an optional parameter to DenseGmmEM 9b2fc2a [Travis Galoppo] Style improvements Changed ExpectationSum to a private class b97fe00 [Travis Galoppo] Minor fixes and tweaks. 1de73f3 [Travis Galoppo] Removed redundant array from array creation 578c2d1 [Travis Galoppo] Removed unused import 227ad66 [Travis Galoppo] Moved prediction methods into model class. 308c8ad [Travis Galoppo] Numerous changes to improve code cff73e0 [Travis Galoppo] Replaced accumulators with RDD.aggregate 20ebca1 [Travis Galoppo] Removed unusued code 42b2142 [Travis Galoppo] Added functionality to allow setting of GMM starting point. Added two cluster test to testing suite. 
8b633f3 [Travis Galoppo] Style issue 9be2534 [Travis Galoppo] Style issue d695034 [Travis Galoppo] Fixed style issues c3b8ce0 [Travis Galoppo] Merge branch 'master' of https://github.com/tgaloppo/spark Adds predict() method 2df336b [Travis Galoppo] Fixed style issue b99ecc4 [tgaloppo] Merge pull request #1 from FlytxtRnD/predictBranch f407b4c [FlytxtRnD] Added predict() to return the cluster labels and membership values 97044cf [Travis Galoppo] Fixed style issues dc9c742 [Travis Galoppo] Moved MultivariateGaussian utility class e7d413b [Travis Galoppo] Moved multivariate Gaussian utility class to mllib/stat/impl Improved comments 9770261 [Travis Galoppo] Corrected a variety of style and naming issues. 8aaa17d [Travis Galoppo] Added additional train() method to companion object for cluster count and tolerance parameters. 676e523 [Travis Galoppo] Fixed to no longer ignore delta value provided on command line e6ea805 [Travis Galoppo] Merged with master branch; update test suite with latest context changes. Improved cluster initialization strategy. 86fb382 [Travis Galoppo] Merge remote-tracking branch 'upstream/master' 719d8cc [Travis Galoppo] Added scala test suite with basic test c1a8e16 [Travis Galoppo] Made GaussianMixtureModel class serializable Modified sum function for better performance 5c96c57 [Travis Galoppo] Merge remote-tracking branch 'upstream/master' c15405c [Travis Galoppo] SPARK-4156 --- .../spark/examples/mllib/DenseGmmEM.scala | 67 +++++ .../mllib/clustering/GaussianMixtureEM.scala | 241 ++++++++++++++++++ .../clustering/GaussianMixtureModel.scala | 91 +++++++ .../stat/impl/MultivariateGaussian.scala | 39 +++ .../org/apache/spark/mllib/util/MLUtils.scala | 2 +- .../GMMExpectationMaximizationSuite.scala | 78 ++++++ 6 files changed, 517 insertions(+), 1 deletion(-) create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala new file mode 100644 index 0000000000000..948c350953e27 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.mllib + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.mllib.clustering.GaussianMixtureEM +import org.apache.spark.mllib.linalg.Vectors + +/** + * An example Gaussian Mixture Model EM app. Run with + * {{{ + * ./bin/run-example org.apache.spark.examples.mllib.DenseGmmEM + * }}} + * If you use it as a template to create your own app, please use `spark-submit` to submit your app. + */ +object DenseGmmEM { + def main(args: Array[String]): Unit = { + if (args.length < 3) { + println("usage: DenseGmmEM [maxIterations]") + } else { + val maxIterations = if (args.length > 3) args(3).toInt else 100 + run(args(0), args(1).toInt, args(2).toDouble, maxIterations) + } + } + + private def run(inputFile: String, k: Int, convergenceTol: Double, maxIterations: Int) { + val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example") + val ctx = new SparkContext(conf) + + val data = ctx.textFile(inputFile).map { line => + Vectors.dense(line.trim.split(' ').map(_.toDouble)) + }.cache() + + val clusters = new GaussianMixtureEM() + .setK(k) + .setConvergenceTol(convergenceTol) + .setMaxIterations(maxIterations) + .run(data) + + for (i <- 0 until clusters.k) { + println("weight=%f\nmu=%s\nsigma=\n%s\n" format + (clusters.weight(i), clusters.mu(i), clusters.sigma(i))) + } + + println("Cluster labels (first <= 100):") + val clusterLabels = clusters.predict(data) + clusterLabels.take(100).foreach { x => + print(" " + x) + } + println() + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala new file mode 100644 index 0000000000000..bdf984aee4dae --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.clustering + +import scala.collection.mutable.IndexedSeq + +import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix, diag, Transpose} +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors} +import org.apache.spark.mllib.stat.impl.MultivariateGaussian +import org.apache.spark.mllib.util.MLUtils + +/** + * This class performs expectation maximization for multivariate Gaussian + * Mixture Models (GMMs). A GMM represents a composite distribution of + * independent Gaussian distributions with associated "mixing" weights + * specifying each's contribution to the composite. 
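+ * That is, the model density is p(x) = sum_{i=1..k} w(i) * N(x; mu(i), sigma(i)),
+ * where the mixing weights w(i) are non-negative and sum to one.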
+ * + * Given a set of sample points, this class will maximize the log-likelihood + * for a mixture of k Gaussians, iterating until the log-likelihood changes by + * less than convergenceTol, or until it has reached the max number of iterations. + * While this process is generally guaranteed to converge, it is not guaranteed + * to find a global optimum. + * + * @param k The number of independent Gaussians in the mixture model + * @param convergenceTol The maximum change in log-likelihood at which convergence + * is considered to have occurred. + * @param maxIterations The maximum number of iterations to perform + */ +class GaussianMixtureEM private ( + private var k: Int, + private var convergenceTol: Double, + private var maxIterations: Int) extends Serializable { + + /** A default instance, 2 Gaussians, 100 iterations, 0.01 log-likelihood threshold */ + def this() = this(2, 0.01, 100) + + // number of samples per cluster to use when initializing Gaussians + private val nSamples = 5 + + // an initializing GMM can be provided rather than using the + // default random starting point + private var initialModel: Option[GaussianMixtureModel] = None + + /** Set the initial GMM starting point, bypassing the random initialization. + * You must call setK() prior to calling this method, and the condition + * (model.k == this.k) must be met; failure will result in an IllegalArgumentException + */ + def setInitialModel(model: GaussianMixtureModel): this.type = { + if (model.k == k) { + initialModel = Some(model) + } else { + throw new IllegalArgumentException("mismatched cluster count (model.k != k)") + } + this + } + + /** Return the user supplied initial GMM, if supplied */ + def getInitialModel: Option[GaussianMixtureModel] = initialModel + + /** Set the number of Gaussians in the mixture model. Default: 2 */ + def setK(k: Int): this.type = { + this.k = k + this + } + + /** Return the number of Gaussians in the mixture model */ + def getK: Int = k + + /** Set the maximum number of iterations to run. Default: 100 */ + def setMaxIterations(maxIterations: Int): this.type = { + this.maxIterations = maxIterations + this + } + + /** Return the maximum number of iterations to run */ + def getMaxIterations: Int = maxIterations + + /** + * Set the largest change in log-likelihood at which convergence is + * considered to have occurred. + */ + def setConvergenceTol(convergenceTol: Double): this.type = { + this.convergenceTol = convergenceTol + this + } + + /** Return the largest change in log-likelihood at which convergence is + * considered to have occurred. + */ + def getConvergenceTol: Double = convergenceTol + + /** Perform expectation maximization */ + def run(data: RDD[Vector]): GaussianMixtureModel = { + val sc = data.sparkContext + + // we will operate on the data as breeze data + val breezeData = data.map(u => u.toBreeze.toDenseVector).cache() + + // Get length of the input vectors + val d = breezeData.first.length + + // Determine initial weights and corresponding Gaussians. 
+ // If the user supplied an initial GMM, we use those values, otherwise + // we start with uniform weights, a random mean from the data, and + // diagonal covariance matrices using component variances + // derived from the samples + val (weights, gaussians) = initialModel match { + case Some(gmm) => (gmm.weight, gmm.mu.zip(gmm.sigma).map { case(mu, sigma) => + new MultivariateGaussian(mu.toBreeze.toDenseVector, sigma.toBreeze.toDenseMatrix) + }) + + case None => { + val samples = breezeData.takeSample(true, k * nSamples, scala.util.Random.nextInt) + (Array.fill(k)(1.0 / k), Array.tabulate(k) { i => + val slice = samples.view(i * nSamples, (i + 1) * nSamples) + new MultivariateGaussian(vectorMean(slice), initCovariance(slice)) + }) + } + } + + var llh = Double.MinValue // current log-likelihood + var llhp = 0.0 // previous log-likelihood + + var iter = 0 + while(iter < maxIterations && Math.abs(llh-llhp) > convergenceTol) { + // create and broadcast curried cluster contribution function + val compute = sc.broadcast(ExpectationSum.add(weights, gaussians)_) + + // aggregate the cluster contribution for all sample points + val sums = breezeData.aggregate(ExpectationSum.zero(k, d))(compute.value, _ += _) + + // Create new distributions based on the partial assignments + // (often referred to as the "M" step in literature) + val sumWeights = sums.weights.sum + var i = 0 + while (i < k) { + val mu = sums.means(i) / sums.weights(i) + val sigma = sums.sigmas(i) / sums.weights(i) - mu * new Transpose(mu) // TODO: Use BLAS.dsyr + weights(i) = sums.weights(i) / sumWeights + gaussians(i) = new MultivariateGaussian(mu, sigma) + i = i + 1 + } + + llhp = llh // current becomes previous + llh = sums.logLikelihood // this is the freshly computed log-likelihood + iter += 1 + } + + // Need to convert the breeze matrices to MLlib matrices + val means = Array.tabulate(k) { i => Vectors.fromBreeze(gaussians(i).mu) } + val sigmas = Array.tabulate(k) { i => Matrices.fromBreeze(gaussians(i).sigma) } + new GaussianMixtureModel(weights, means, sigmas) + } + + /** Average of dense breeze vectors */ + private def vectorMean(x: IndexedSeq[BreezeVector[Double]]): BreezeVector[Double] = { + val v = BreezeVector.zeros[Double](x(0).length) + x.foreach(xi => v += xi) + v / x.length.toDouble + } + + /** + * Construct matrix where diagonal entries are element-wise + * variance of input vectors (computes biased variance) + */ + private def initCovariance(x: IndexedSeq[BreezeVector[Double]]): BreezeMatrix[Double] = { + val mu = vectorMean(x) + val ss = BreezeVector.zeros[Double](x(0).length) + x.map(xi => (xi - mu) :^ 2.0).foreach(u => ss += u) + diag(ss / x.length.toDouble) + } +} + +// companion class to provide zero constructor for ExpectationSum +private object ExpectationSum { + def zero(k: Int, d: Int): ExpectationSum = { + new ExpectationSum(0.0, Array.fill(k)(0.0), + Array.fill(k)(BreezeVector.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d,d))) + } + + // compute cluster contributions for each input point + // (U, T) => U for aggregation + def add( + weights: Array[Double], + dists: Array[MultivariateGaussian]) + (sums: ExpectationSum, x: BreezeVector[Double]): ExpectationSum = { + val p = weights.zip(dists).map { + case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(x) + } + val pSum = p.sum + sums.logLikelihood += math.log(pSum) + val xxt = x * new Transpose(x) + var i = 0 + while (i < sums.k) { + p(i) /= pSum + sums.weights(i) += p(i) + sums.means(i) += x * p(i) + sums.sigmas(i) += xxt * p(i) // TODO: use 
BLAS.dsyr + i = i + 1 + } + sums + } +} + +// Aggregation class for partial expectation results +private class ExpectationSum( + var logLikelihood: Double, + val weights: Array[Double], + val means: Array[BreezeVector[Double]], + val sigmas: Array[BreezeMatrix[Double]]) extends Serializable { + + val k = weights.length + + def +=(x: ExpectationSum): ExpectationSum = { + var i = 0 + while (i < k) { + weights(i) += x.weights(i) + means(i) += x.means(i) + sigmas(i) += x.sigmas(i) + i = i + 1 + } + logLikelihood += x.logLikelihood + this + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala new file mode 100644 index 0000000000000..11a110db1f7ca --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.clustering + +import breeze.linalg.{DenseVector => BreezeVector} + +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.{Matrix, Vector} +import org.apache.spark.mllib.stat.impl.MultivariateGaussian +import org.apache.spark.mllib.util.MLUtils + +/** + * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points + * are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are + * the respective mean and covariance for each Gaussian distribution i=1..k. + * + * @param weight Weights for each Gaussian distribution in the mixture, where weight(i) is + * the weight for Gaussian i, and weight.sum == 1 + * @param mu Means for each Gaussian in the mixture, where mu(i) is the mean for Gaussian i + * @param sigma Covariance maxtrix for each Gaussian in the mixture, where sigma(i) is the + * covariance matrix for Gaussian i + */ +class GaussianMixtureModel( + val weight: Array[Double], + val mu: Array[Vector], + val sigma: Array[Matrix]) extends Serializable { + + /** Number of gaussians in mixture */ + def k: Int = weight.length + + /** Maps given points to their cluster indices. */ + def predict(points: RDD[Vector]): RDD[Int] = { + val responsibilityMatrix = predictMembership(points, mu, sigma, weight, k) + responsibilityMatrix.map(r => r.indexOf(r.max)) + } + + /** + * Given the input vectors, return the membership value of each vector + * to all mixture components. 
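+   * For a point x, the membership in component i is
+   *   w(i) * N(x; mu(i), sigma(i)) / sum_j w(j) * N(x; mu(j), sigma(j)),
+   * computed with a small EPSILON added to each term for numerical stability.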
+ */ + def predictMembership( + points: RDD[Vector], + mu: Array[Vector], + sigma: Array[Matrix], + weight: Array[Double], + k: Int): RDD[Array[Double]] = { + val sc = points.sparkContext + val dists = sc.broadcast { + (0 until k).map { i => + new MultivariateGaussian(mu(i).toBreeze.toDenseVector, sigma(i).toBreeze.toDenseMatrix) + }.toArray + } + val weights = sc.broadcast(weight) + points.map { x => + computeSoftAssignments(x.toBreeze.toDenseVector, dists.value, weights.value, k) + } + } + + /** + * Compute the partial assignments for each vector + */ + private def computeSoftAssignments( + pt: BreezeVector[Double], + dists: Array[MultivariateGaussian], + weights: Array[Double], + k: Int): Array[Double] = { + val p = weights.zip(dists).map { + case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(pt) + } + val pSum = p.sum + for (i <- 0 until k) { + p(i) /= pSum + } + p + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala new file mode 100644 index 0000000000000..2eab5d277827d --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.stat.impl + +import breeze.linalg.{DenseVector => DBV, DenseMatrix => DBM, Transpose, det, pinv} + +/** + * Utility class to implement the density function for multivariate Gaussian distribution. + * Breeze provides this functionality, but it requires the Apache Commons Math library, + * so this class is here so-as to not introduce a new dependency in Spark. 
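+ *
+ * The density computed is
+ *   pdf(x) = (2 * pi)^(-d / 2) * det(sigma)^(-1 / 2) * exp(-0.5 * (x - mu)' * pinv(sigma) * (x - mu)),
+ * where d is the dimension of mu and pinv denotes the pseudo-inverse of sigma.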
+ */ +private[mllib] class MultivariateGaussian( + val mu: DBV[Double], + val sigma: DBM[Double]) extends Serializable { + private val sigmaInv2 = pinv(sigma) * -0.5 + private val U = math.pow(2.0 * math.Pi, -mu.length / 2.0) * math.pow(det(sigma), -0.5) + + /** Returns density of this multivariate Gaussian at given point, x */ + def pdf(x: DBV[Double]): Double = { + val delta = x - mu + val deltaTranspose = new Transpose(delta) + U * math.exp(deltaTranspose * sigmaInv2 * delta) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index b0d05ae33e1b5..1d07b5dab8268 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -39,7 +39,7 @@ import org.apache.spark.streaming.dstream.DStream */ object MLUtils { - private[util] lazy val EPSILON = { + private[mllib] lazy val EPSILON = { var eps = 1.0 while ((1.0 + (eps / 2.0)) != 1.0) { eps /= 2.0 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala new file mode 100644 index 0000000000000..23feb82874b70 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.clustering + +import org.scalatest.FunSuite + +import org.apache.spark.mllib.linalg.{Vectors, Matrices} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ + +class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContext { + test("single cluster") { + val data = sc.parallelize(Array( + Vectors.dense(6.0, 9.0), + Vectors.dense(5.0, 10.0), + Vectors.dense(4.0, 11.0) + )) + + // expectations + val Ew = 1.0 + val Emu = Vectors.dense(5.0, 10.0) + val Esigma = Matrices.dense(2, 2, Array(2.0 / 3.0, -2.0 / 3.0, -2.0 / 3.0, 2.0 / 3.0)) + + val gmm = new GaussianMixtureEM().setK(1).run(data) + + assert(gmm.weight(0) ~== Ew absTol 1E-5) + assert(gmm.mu(0) ~== Emu absTol 1E-5) + assert(gmm.sigma(0) ~== Esigma absTol 1E-5) + } + + test("two clusters") { + val data = sc.parallelize(Array( + Vectors.dense(-5.1971), Vectors.dense(-2.5359), Vectors.dense(-3.8220), + Vectors.dense(-5.2211), Vectors.dense(-5.0602), Vectors.dense( 4.7118), + Vectors.dense( 6.8989), Vectors.dense( 3.4592), Vectors.dense( 4.6322), + Vectors.dense( 5.7048), Vectors.dense( 4.6567), Vectors.dense( 5.5026), + Vectors.dense( 4.5605), Vectors.dense( 5.2043), Vectors.dense( 6.2734) + )) + + // we set an initial gaussian to induce expected results + val initialGmm = new GaussianMixtureModel( + Array(0.5, 0.5), + Array(Vectors.dense(-1.0), Vectors.dense(1.0)), + Array(Matrices.dense(1, 1, Array(1.0)), Matrices.dense(1, 1, Array(1.0))) + ) + + val Ew = Array(1.0 / 3.0, 2.0 / 3.0) + val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604)) + val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644))) + + val gmm = new GaussianMixtureEM() + .setK(2) + .setInitialModel(initialGmm) + .run(data) + + assert(gmm.weight(0) ~== Ew(0) absTol 1E-3) + assert(gmm.weight(1) ~== Ew(1) absTol 1E-3) + assert(gmm.mu(0) ~== Emu(0) absTol 1E-3) + assert(gmm.mu(1) ~== Emu(1) absTol 1E-3) + assert(gmm.sigma(0) ~== Esigma(0) absTol 1E-3) + assert(gmm.sigma(1) ~== Esigma(1) absTol 1E-3) + } +} From 343db392b58fb33a3e4bc6fda1da69aaf686b5a9 Mon Sep 17 00:00:00 2001 From: ganonp Date: Mon, 29 Dec 2014 15:31:19 -0800 Subject: [PATCH 010/116] Added setMinCount to Word2Vec.scala Wanted to customize the private minCount variable in the Word2Vec class. Added a method to do so. Author: ganonp Closes #3693 from ganonp/my-custom-spark and squashes the following commits: ad534f2 [ganonp] made norm method public 5110a6f [ganonp] Reorganized 854958b [ganonp] Fixed Indentation for setMinCount 12ed8f9 [ganonp] Update Word2Vec.scala 76bdf5a [ganonp] Update Word2Vec.scala ffb88bb [ganonp] Update Word2Vec.scala 5eb9100 [ganonp] Added setMinCount to Word2Vec.scala --- .../org/apache/spark/mllib/feature/Word2Vec.scala | 15 +++++++++++---- .../org/apache/spark/mllib/linalg/Vectors.scala | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 7960f3cab576f..d25a7cd5b439d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -71,7 +71,8 @@ class Word2Vec extends Serializable with Logging { private var numPartitions = 1 private var numIterations = 1 private var seed = Utils.random.nextLong() - + private var minCount = 5 + /** * Sets vector size (default: 100). 
*/ @@ -114,6 +115,15 @@ class Word2Vec extends Serializable with Logging { this } + /** + * Sets minCount, the minimum number of times a token must appear to be included in the word2vec + * model's vocabulary (default: 5). + */ + def setMinCount(minCount: Int): this.type = { + this.minCount = minCount + this + } + private val EXP_TABLE_SIZE = 1000 private val MAX_EXP = 6 private val MAX_CODE_LENGTH = 40 @@ -122,9 +132,6 @@ class Word2Vec extends Serializable with Logging { /** context words from [-window, window] */ private val window = 5 - /** minimum frequency to consider a vocabulary word */ - private val minCount = 5 - private var trainWordsCount = 0 private var vocabSize = 0 private var vocab: Array[VocabWord] = null diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 47d1a76fa361d..01f3f90577142 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -268,7 +268,7 @@ object Vectors { * @param p norm. * @return norm in L^p^ space. */ - private[spark] def norm(vector: Vector, p: Double): Double = { + def norm(vector: Vector, p: Double): Double = { require(p >= 1.0) val values = vector match { case dv: DenseVector => dv.values From 040d6f2d13b132b3ef2a1e4f12f9f0e781c5a0b8 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Mon, 29 Dec 2014 17:17:12 -0800 Subject: [PATCH 011/116] [SPARK-4972][MLlib] Updated the scala doc for lasso and ridge regression for the change of LeastSquaresGradient In #SPARK-4907, we added factor of 2 into the LeastSquaresGradient. We updated the scala doc for lasso and ridge regression here. Author: DB Tsai Closes #3808 from dbtsai/doc and squashes the following commits: ec3c989 [DB Tsai] first commit --- .../main/scala/org/apache/spark/mllib/regression/Lasso.scala | 2 +- .../org/apache/spark/mllib/regression/RidgeRegression.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala index f9791c6571782..8ecd5c6ad93c0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala @@ -45,7 +45,7 @@ class LassoModel ( /** * Train a regression model with L1-regularization using Stochastic Gradient Descent. * This solves the l1-regularized least squares regression formulation - * f(weights) = 1/n ||A weights-y||^2 + regParam ||weights||_1 + * f(weights) = 1/2n ||A weights-y||^2 + regParam ||weights||_1 * Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with * its corresponding right hand side label y. * See also the documentation for the precise formulation. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala index c8cad773f5efb..076ba35051c9d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala @@ -45,7 +45,7 @@ class RidgeRegressionModel ( /** * Train a regression model with L2-regularization using Stochastic Gradient Descent. 
* This solves the l1-regularized least squares regression formulation - * f(weights) = 1/n ||A weights-y||^2 + regParam/2 ||weights||^2 + * f(weights) = 1/2n ||A weights-y||^2 + regParam/2 ||weights||^2 * Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with * its corresponding right hand side label y. * See also the documentation for the precise formulation. From 9077e721cd36adfecd50cbd1fd7735d28e5be8b5 Mon Sep 17 00:00:00 2001 From: "Zhang, Liye" Date: Tue, 30 Dec 2014 09:19:47 -0800 Subject: [PATCH 012/116] [SPARK-4920][UI] add version on master and worker page for standalone mode Author: Zhang, Liye Closes #3769 from liyezhang556520/spark-4920_WebVersion and squashes the following commits: 3bb7e0d [Zhang, Liye] add version on master and worker page --- core/src/main/scala/org/apache/spark/ui/UIUtils.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 7486cb6b1bbc0..b5022fe853c49 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -234,8 +234,9 @@ private[spark] object UIUtils extends Logging {

- + + {org.apache.spark.SPARK_VERSION} {title}

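A quick aside on using the `setMinCount` setter introduced in PATCH 010 above: the sketch below is a hypothetical, minimal driver (the corpus, object name, app name, and chosen values are invented for illustration) showing the builder-style call chain; it is not taken from the patch itself.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.Word2Vec

object Word2VecMinCountSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("word2vec-mincount"))

    // Tiny, made-up tokenized corpus; each element is one sentence.
    val sentences = sc.parallelize(Seq(
      "spark is a fast engine".split(" ").toSeq,
      "spark runs on the jvm".split(" ").toSeq,
      "mllib ships with word2vec".split(" ").toSeq))

    // Lowering minCount below the previous hard-coded default of 5 keeps rare
    // tokens in the vocabulary, at the cost of noisier vectors.
    val model = new Word2Vec()
      .setVectorSize(10)
      .setMinCount(1)
      .fit(sentences)

    println(model.transform("spark"))
    sc.stop()
  }
}
```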
From efa80a531ecd485f6cf0cdc24ffa42ba17eea46d Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 30 Dec 2014 09:29:52 -0800 Subject: [PATCH 013/116] [SPARK-4882] Register PythonBroadcast with Kryo so that PySpark works with KryoSerializer This PR fixes an issue where PySpark broadcast variables caused NullPointerExceptions if KryoSerializer was used. The fix is to register PythonBroadcast with Kryo so that it's deserialized with a KryoJavaSerializer. Author: Josh Rosen Closes #3831 from JoshRosen/SPARK-4882 and squashes the following commits: 0466c7a [Josh Rosen] Register PythonBroadcast with Kryo. d5b409f [Josh Rosen] Enable registrationRequired, which would have caught this bug. 069d8a7 [Josh Rosen] Add failing test for SPARK-4882 --- .../spark/serializer/KryoSerializer.scala | 2 + .../api/python/PythonBroadcastSuite.scala | 60 +++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 621a951c27d07..d2947dcea4f7c 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -26,6 +26,7 @@ import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializ import com.twitter.chill.{AllScalaRegistrar, EmptyScalaKryoInstantiator} import org.apache.spark._ +import org.apache.spark.api.python.PythonBroadcast import org.apache.spark.broadcast.HttpBroadcast import org.apache.spark.network.nio.{PutBlock, GotBlock, GetBlock} import org.apache.spark.scheduler.MapStatus @@ -90,6 +91,7 @@ class KryoSerializer(conf: SparkConf) // Allow sending SerializableWritable kryo.register(classOf[SerializableWritable[_]], new KryoJavaSerializer()) kryo.register(classOf[HttpBroadcast[_]], new KryoJavaSerializer()) + kryo.register(classOf[PythonBroadcast], new KryoJavaSerializer()) try { // Use the default classloader when calling the user registrator. diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala new file mode 100644 index 0000000000000..8959a843dbd7d --- /dev/null +++ b/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.api.python + +import scala.io.Source + +import java.io.{PrintWriter, File} + +import org.scalatest.{Matchers, FunSuite} + +import org.apache.spark.{SharedSparkContext, SparkConf} +import org.apache.spark.serializer.KryoSerializer +import org.apache.spark.util.Utils + +// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize +// a PythonBroadcast: +class PythonBroadcastSuite extends FunSuite with Matchers with SharedSparkContext { + test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") { + val tempDir = Utils.createTempDir() + val broadcastedString = "Hello, world!" + def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = { + val source = Source.fromFile(broadcast.path) + val contents = source.mkString + source.close() + contents should be (broadcastedString) + } + try { + val broadcastDataFile: File = { + val file = new File(tempDir, "broadcastData") + val printWriter = new PrintWriter(file) + printWriter.write(broadcastedString) + printWriter.close() + file + } + val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath) + assertBroadcastIsValid(broadcast) + val conf = new SparkConf().set("spark.kryo.registrationRequired", "true") + val deserializedBroadcast = + Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance()) + assertBroadcastIsValid(deserializedBroadcast) + } finally { + Utils.deleteRecursively(tempDir) + } + } +} From 480bd1d2edd1de06af607b0cf3ff3c0b16089add Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 30 Dec 2014 11:24:46 -0800 Subject: [PATCH 014/116] [SPARK-4908][SQL] Prevent multiple concurrent hive native commands This is just a quick fix that locks when calling `runHive`. If we can find a way to avoid the error without a global lock that would be better. Author: Michael Armbrust Closes #3834 from marmbrus/hiveConcurrency and squashes the following commits: bf25300 [Michael Armbrust] prevent multiple concurrent hive native commands --- .../main/scala/org/apache/spark/sql/hive/HiveContext.scala | 2 +- .../apache/spark/sql/hive/execution/HiveQuerySuite.scala | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 56fe27a77b838..982e0593fcfd1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -284,7 +284,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { * Execute the command using Hive and return the results as a sequence. Each element * in the sequence is one row. 
*/ - protected def runHive(cmd: String, maxRows: Int = 1000): Seq[String] = { + protected def runHive(cmd: String, maxRows: Int = 1000): Seq[String] = synchronized { try { val cmd_trimmed: String = cmd.trim() val tokens: Array[String] = cmd_trimmed.split("\\s+") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 4d81acc753a27..fb6da33e88ef6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -56,6 +56,13 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { Locale.setDefault(originalLocale) } + test("SPARK-4908: concurent hive native commands") { + (1 to 100).par.map { _ => + sql("USE default") + sql("SHOW TABLES") + } + } + createQueryTest("constant object inspector for generic udf", """SELECT named_struct( lower("AA"), "10", From 94d60b7021960dc10d98039dbc6ad7193e8557f5 Mon Sep 17 00:00:00 2001 From: Daoyuan Wang Date: Tue, 30 Dec 2014 11:29:13 -0800 Subject: [PATCH 015/116] [SQL] enable view test This is a follow up of #3396 , just add a test to white list. Author: Daoyuan Wang Closes #3826 from adrian-wang/viewtest and squashes the following commits: f105f68 [Daoyuan Wang] enable view test --- .../execution/HiveCompatibilitySuite.scala | 3 +- ...anslate-0-dc7fc9ce5109ef459ee84ccfbb12d2c0 | 0 ...anslate-1-3896ae0e680a5fdc01833533b11c07bb | 0 ...nslate-10-7016e1e3a4248564f3d08cddad7ae116 | 0 ...nslate-11-e27c6a59a833dcbc2e5cdb7ff7972828 | 0 ...anslate-2-6b4caec6d7e3a91e61720bbd6b7697f0 | 0 ...anslate-3-30dc3e80e3873af5115e4f5e39078a13 | 27 ++++++++++++++++ ...anslate-4-cefb7530126f9e60cb4a29441d578f23 | 0 ...anslate-5-856ea995681b18a543dc0e53b8b43a8e | 32 +++++++++++++++++++ ...anslate-6-a14cfe3eff322066e61023ec06c7735d | 0 ...anslate-7-e947bf2dacc907825df154a4131a3fcc | 0 ...anslate-8-b1a99b0beffb0b298aec9233ecc0707f | 0 ...anslate-9-fc0dc39c4796d917685e0797bc4a9786 | 0 13 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-0-dc7fc9ce5109ef459ee84ccfbb12d2c0 create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-1-3896ae0e680a5fdc01833533b11c07bb create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-10-7016e1e3a4248564f3d08cddad7ae116 create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-11-e27c6a59a833dcbc2e5cdb7ff7972828 create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-2-6b4caec6d7e3a91e61720bbd6b7697f0 create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-3-30dc3e80e3873af5115e4f5e39078a13 create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-4-cefb7530126f9e60cb4a29441d578f23 create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-5-856ea995681b18a543dc0e53b8b43a8e create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-6-a14cfe3eff322066e61023ec06c7735d create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-7-e947bf2dacc907825df154a4131a3fcc create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-8-b1a99b0beffb0b298aec9233ecc0707f create mode 100644 sql/hive/src/test/resources/golden/create_view_translate-9-fc0dc39c4796d917685e0797bc4a9786 diff --git 
a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 1e44dd239458a..23283fd3fe6b1 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -101,6 +101,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "describe_comment_nonascii", "create_merge_compressed", + "create_view", "create_view_partitioned", "database_location", "database_properties", @@ -110,7 +111,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // Weird DDL differences result in failures on jenkins. "create_like2", - "create_view_translate", "partitions_json", // This test is totally fine except that it includes wrong queries and expects errors, but error @@ -349,6 +349,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "create_nested_type", "create_skewed_table1", "create_struct_table", + "create_view_translate", "cross_join", "cross_product_check_1", "cross_product_check_2", diff --git a/sql/hive/src/test/resources/golden/create_view_translate-0-dc7fc9ce5109ef459ee84ccfbb12d2c0 b/sql/hive/src/test/resources/golden/create_view_translate-0-dc7fc9ce5109ef459ee84ccfbb12d2c0 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/create_view_translate-1-3896ae0e680a5fdc01833533b11c07bb b/sql/hive/src/test/resources/golden/create_view_translate-1-3896ae0e680a5fdc01833533b11c07bb new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/create_view_translate-10-7016e1e3a4248564f3d08cddad7ae116 b/sql/hive/src/test/resources/golden/create_view_translate-10-7016e1e3a4248564f3d08cddad7ae116 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/create_view_translate-11-e27c6a59a833dcbc2e5cdb7ff7972828 b/sql/hive/src/test/resources/golden/create_view_translate-11-e27c6a59a833dcbc2e5cdb7ff7972828 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/create_view_translate-2-6b4caec6d7e3a91e61720bbd6b7697f0 b/sql/hive/src/test/resources/golden/create_view_translate-2-6b4caec6d7e3a91e61720bbd6b7697f0 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/create_view_translate-3-30dc3e80e3873af5115e4f5e39078a13 b/sql/hive/src/test/resources/golden/create_view_translate-3-30dc3e80e3873af5115e4f5e39078a13 new file mode 100644 index 0000000000000..cec5f77033aa4 --- /dev/null +++ b/sql/hive/src/test/resources/golden/create_view_translate-3-30dc3e80e3873af5115e4f5e39078a13 @@ -0,0 +1,27 @@ +# col_name data_type comment + +key string + +# Detailed Table Information +Database: default +Owner: animal +CreateTime: Mon Dec 29 00:57:55 PST 2014 +LastAccessTime: UNKNOWN +Protect Mode: None +Retention: 0 +Table Type: VIRTUAL_VIEW +Table Parameters: + transient_lastDdlTime 1419843475 + +# Storage Information +SerDe Library: null +InputFormat: org.apache.hadoop.mapred.SequenceFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] + +# View Information +View Original Text: select cast(key as 
string) from src +View Expanded Text: select cast(`src`.`key` as string) from `default`.`src` diff --git a/sql/hive/src/test/resources/golden/create_view_translate-4-cefb7530126f9e60cb4a29441d578f23 b/sql/hive/src/test/resources/golden/create_view_translate-4-cefb7530126f9e60cb4a29441d578f23 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/create_view_translate-5-856ea995681b18a543dc0e53b8b43a8e b/sql/hive/src/test/resources/golden/create_view_translate-5-856ea995681b18a543dc0e53b8b43a8e new file mode 100644 index 0000000000000..bf582fc0964a3 --- /dev/null +++ b/sql/hive/src/test/resources/golden/create_view_translate-5-856ea995681b18a543dc0e53b8b43a8e @@ -0,0 +1,32 @@ +# col_name data_type comment + +key int +value string + +# Detailed Table Information +Database: default +Owner: animal +CreateTime: Mon Dec 29 00:57:55 PST 2014 +LastAccessTime: UNKNOWN +Protect Mode: None +Retention: 0 +Table Type: VIRTUAL_VIEW +Table Parameters: + transient_lastDdlTime 1419843475 + +# Storage Information +SerDe Library: null +InputFormat: org.apache.hadoop.mapred.SequenceFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] + +# View Information +View Original Text: select key, value from ( + select key, value from src +) a +View Expanded Text: select key, value from ( + select `src`.`key`, `src`.`value` from `default`.`src` +) `a` diff --git a/sql/hive/src/test/resources/golden/create_view_translate-6-a14cfe3eff322066e61023ec06c7735d b/sql/hive/src/test/resources/golden/create_view_translate-6-a14cfe3eff322066e61023ec06c7735d new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/create_view_translate-7-e947bf2dacc907825df154a4131a3fcc b/sql/hive/src/test/resources/golden/create_view_translate-7-e947bf2dacc907825df154a4131a3fcc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/create_view_translate-8-b1a99b0beffb0b298aec9233ecc0707f b/sql/hive/src/test/resources/golden/create_view_translate-8-b1a99b0beffb0b298aec9233ecc0707f new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/create_view_translate-9-fc0dc39c4796d917685e0797bc4a9786 b/sql/hive/src/test/resources/golden/create_view_translate-9-fc0dc39c4796d917685e0797bc4a9786 new file mode 100644 index 0000000000000..e69de29bb2d1d From 65357f11c25a7c91577df5da31ebf349d7845eef Mon Sep 17 00:00:00 2001 From: scwf Date: Tue, 30 Dec 2014 11:30:47 -0800 Subject: [PATCH 016/116] [SPARK-4975][SQL] Fix HiveInspectorSuite test failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HiveInspectorSuite test failure: [info] - wrap / unwrap null, constant null and writables *** FAILED *** (21 milliseconds) [info] 1 did not equal 0 (HiveInspectorSuite.scala:136) this is because the origin date(is 3914-10-23) not equals the date returned by ```unwrap```(is 3914-10-22). Setting TimeZone and Locale fix this. 
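As a rough, self-contained illustration of that one-day shift (a stand-in for the real `HiveInspectors` code path, not a copy of it), pinning the default TimeZone and Locale the same way the test now does makes the behaviour reproducible: a `java.sql.Date` parsed in a zone behind UTC comes back one calendar day earlier once the value round-trips through a UTC-based day count.

```scala
import java.util.{Locale, TimeZone}

object DateRoundTripSketch {
  def main(args: Array[String]): Unit = {
    // Same pinning as the test, so the output does not depend on the machine.
    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"))
    Locale.setDefault(Locale.US)

    val millisPerDay = 24L * 60 * 60 * 1000
    val original = java.sql.Date.valueOf("3914-10-23")   // parsed at local midnight

    // Crude stand-in for a conversion that goes through a day count:
    // floor to whole days since the epoch, then turn the count back into a Date.
    val days = original.getTime / millisPerDay
    val roundTripped = new java.sql.Date(days * millisPerDay)

    // Prints "3914-10-23 -> 3914-10-22": local midnight is 07:00 UTC here, so the
    // UTC-midnight instant falls on the previous local calendar day.
    println(s"$original -> $roundTripped")
  }
}
```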
Another minor change here is rename ```def checkValues(v1: Any, v2: Any): Unit``` to ```def checkValue(v1: Any, v2: Any): Unit ``` to make the code more clear Author: scwf Author: Fei Wang Closes #3814 from scwf/fix-inspectorsuite and squashes the following commits: d8531ef [Fei Wang] Delete test.log 72b19a9 [scwf] fix HiveInspectorSuite test error --- .../spark/sql/hive/HiveInspectorSuite.scala | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala index bfe608a51a30b..f90d3607915ae 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive import java.sql.Date import java.util +import java.util.{Locale, TimeZone} import org.apache.hadoop.hive.serde2.io.DoubleWritable import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory @@ -63,6 +64,11 @@ class HiveInspectorSuite extends FunSuite with HiveInspectors { .get()) } + // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) + TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) + // Add Locale setting + Locale.setDefault(Locale.US) + val data = Literal(true) :: Literal(0.asInstanceOf[Byte]) :: @@ -121,11 +127,11 @@ class HiveInspectorSuite extends FunSuite with HiveInspectors { def checkValues(row1: Seq[Any], row2: Seq[Any]): Unit = { row1.zip(row2).map { - case (r1, r2) => checkValues(r1, r2) + case (r1, r2) => checkValue(r1, r2) } } - def checkValues(v1: Any, v2: Any): Unit = { + def checkValue(v1: Any, v2: Any): Unit = { (v1, v2) match { case (r1: Decimal, r2: Decimal) => // Ignore the Decimal precision @@ -195,26 +201,26 @@ class HiveInspectorSuite extends FunSuite with HiveInspectors { }) checkValues(row, unwrap(wrap(row, toInspector(dt)), toInspector(dt)).asInstanceOf[Row]) - checkValues(null, unwrap(wrap(null, toInspector(dt)), toInspector(dt))) + checkValue(null, unwrap(wrap(null, toInspector(dt)), toInspector(dt))) } test("wrap / unwrap Array Type") { val dt = ArrayType(dataTypes(0)) val d = row(0) :: row(0) :: Nil - checkValues(d, unwrap(wrap(d, toInspector(dt)), toInspector(dt))) - checkValues(null, unwrap(wrap(null, toInspector(dt)), toInspector(dt))) - checkValues(d, unwrap(wrap(d, toInspector(Literal(d, dt))), toInspector(Literal(d, dt)))) - checkValues(d, unwrap(wrap(null, toInspector(Literal(d, dt))), toInspector(Literal(d, dt)))) + checkValue(d, unwrap(wrap(d, toInspector(dt)), toInspector(dt))) + checkValue(null, unwrap(wrap(null, toInspector(dt)), toInspector(dt))) + checkValue(d, unwrap(wrap(d, toInspector(Literal(d, dt))), toInspector(Literal(d, dt)))) + checkValue(d, unwrap(wrap(null, toInspector(Literal(d, dt))), toInspector(Literal(d, dt)))) } test("wrap / unwrap Map Type") { val dt = MapType(dataTypes(0), dataTypes(1)) val d = Map(row(0) -> row(1)) - checkValues(d, unwrap(wrap(d, toInspector(dt)), toInspector(dt))) - checkValues(null, unwrap(wrap(null, toInspector(dt)), toInspector(dt))) - checkValues(d, unwrap(wrap(d, toInspector(Literal(d, dt))), toInspector(Literal(d, dt)))) - checkValues(d, unwrap(wrap(null, toInspector(Literal(d, dt))), toInspector(Literal(d, dt)))) + checkValue(d, unwrap(wrap(d, toInspector(dt)), toInspector(dt))) + checkValue(null, unwrap(wrap(null, toInspector(dt)), 
toInspector(dt))) + checkValue(d, unwrap(wrap(d, toInspector(Literal(d, dt))), toInspector(Literal(d, dt)))) + checkValue(d, unwrap(wrap(null, toInspector(Literal(d, dt))), toInspector(Literal(d, dt)))) } } From 5595eaa74f139fdb6fd8a7bb0ca6ed421ef00ac8 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Tue, 30 Dec 2014 11:33:47 -0800 Subject: [PATCH 017/116] [SPARK-4959] [SQL] Attributes are case sensitive when using a select query from a projection Author: Cheng Hao Closes #3796 from chenghao-intel/spark_4959 and squashes the following commits: 3ec08f8 [Cheng Hao] Replace the attribute in comparing its exprId other than itself --- .../spark/sql/catalyst/optimizer/Optimizer.scala | 8 ++++---- .../sql/hive/execution/HiveTableScanSuite.scala | 14 +++++++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 806c1394eb151..0f2eae6400d21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -142,16 +142,16 @@ object ColumnPruning extends Rule[LogicalPlan] { case Project(projectList1, Project(projectList2, child)) => // Create a map of Aliases to their values from the child projection. // e.g., 'SELECT ... FROM (SELECT a + b AS c, d ...)' produces Map(c -> Alias(a + b, c)). - val aliasMap = projectList2.collect { - case a @ Alias(e, _) => (a.toAttribute: Expression, a) - }.toMap + val aliasMap = AttributeMap(projectList2.collect { + case a @ Alias(e, _) => (a.toAttribute, a) + }) // Substitute any attributes that are produced by the child projection, so that we safely // eliminate it. // e.g., 'SELECT c + 1 FROM (SELECT a + b AS C ...' produces 'SELECT a + b + 1 ...' // TODO: Fix TransformBase to avoid the cast below. 
val substitutedProjection = projectList1.map(_.transform { - case a if aliasMap.contains(a) => aliasMap(a) + case a: Attribute if aliasMap.contains(a) => aliasMap(a) }).asInstanceOf[Seq[NamedExpression]] Project(substitutedProjection, child) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index a0ace91060a28..16f77a438e1ae 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -18,7 +18,8 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.{Row, SchemaRDD} +import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.Row import org.apache.spark.util.Utils @@ -76,4 +77,15 @@ class HiveTableScanSuite extends HiveComparisonTest { === Array(Row(java.sql.Timestamp.valueOf("2014-12-11 00:00:00")),Row(null))) TestHive.sql("DROP TABLE timestamp_query_null") } + + test("Spark-4959 Attributes are case sensitive when using a select query from a projection") { + sql("create table spark_4959 (col1 string)") + sql("""insert into table spark_4959 select "hi" from src limit 1""") + table("spark_4959").select( + 'col1.as('CaseSensitiveColName), + 'col1.as('CaseSensitiveColName2)).registerTempTable("spark_4959_2") + + assert(sql("select CaseSensitiveColName from spark_4959_2").first() === Row("hi")) + assert(sql("select casesensitivecolname from spark_4959_2").first() === Row("hi")) + } } From 63b84b7d6785a687dd7f4c0e2bb1e348800d30d8 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Tue, 30 Dec 2014 11:47:08 -0800 Subject: [PATCH 018/116] [SPARK-4904] [SQL] Remove the unnecessary code change in Generic UDF Since #3429 has been merged, the bug of wrapping to Writable for HiveGenericUDF is resolved, we can safely remove the foldable checking in `HiveGenericUdf.eval`, which discussed in #2802. Author: Cheng Hao Closes #3745 from chenghao-intel/generic_udf and squashes the following commits: 622ad03 [Cheng Hao] Remove the unnecessary code change in Generic UDF --- .../src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala index 93b6ef9fbc59b..7d863f9d89dae 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala @@ -158,11 +158,6 @@ private[hive] case class HiveGenericUdf(funcWrapper: HiveFunctionWrapper, childr override def foldable = isUDFDeterministic && returnInspector.isInstanceOf[ConstantObjectInspector] - @transient - protected def constantReturnValue = unwrap( - returnInspector.asInstanceOf[ConstantObjectInspector].getWritableConstantValue(), - returnInspector) - @transient protected lazy val deferedObjects = argumentInspectors.map(new DeferredObjectAdapter(_)).toArray[DeferredObject] @@ -171,7 +166,6 @@ private[hive] case class HiveGenericUdf(funcWrapper: HiveFunctionWrapper, childr override def eval(input: Row): Any = { returnInspector // Make sure initialized. 
- if(foldable) return constantReturnValue var i = 0 while (i < children.length) { From daac221302e0cf71a7b7bda31625134cf7b9dce1 Mon Sep 17 00:00:00 2001 From: wangfei Date: Tue, 30 Dec 2014 12:07:24 -0800 Subject: [PATCH 019/116] [SPARK-5002][SQL] Using ascending by default when not specify order in order by spark sql does not support ```SELECT a, b FROM testData2 ORDER BY a desc, b```. Author: wangfei Closes #3838 from scwf/orderby and squashes the following commits: 114b64a [wangfei] remove nouse methods 48145d3 [wangfei] fix order, using asc by default --- .../scala/org/apache/spark/sql/catalyst/SqlParser.scala | 8 ++------ .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 7 +++++++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index d4fc9bbfd3118..66860a4c0923a 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -209,15 +209,11 @@ class SqlParser extends AbstractSparkSQLParser { ) protected lazy val ordering: Parser[Seq[SortOrder]] = - ( rep1sep(singleOrder, ",") - | rep1sep(expression, ",") ~ direction.? ^^ { - case exps ~ d => exps.map(SortOrder(_, d.getOrElse(Ascending))) + ( rep1sep(expression ~ direction.? , ",") ^^ { + case exps => exps.map(pair => SortOrder(pair._1, pair._2.getOrElse(Ascending))) } ) - protected lazy val singleOrder: Parser[SortOrder] = - expression ~ direction ^^ { case e ~ o => SortOrder(e, o) } - protected lazy val direction: Parser[SortDirection] = ( ASC ^^^ Ascending | DESC ^^^ Descending diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index ddf4776ecf7ae..add4e218a22ee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -987,6 +987,13 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { ) } + test("oder by asc by default when not specify ascending and descending") { + checkAnswer( + sql("SELECT a, b FROM testData2 ORDER BY a desc, b"), + Seq((3, 1), (3, 2), (2, 1), (2,2), (1, 1), (1, 2)) + ) + } + test("Supporting relational operator '<=>' in Spark SQL") { val nullCheckData1 = TestData(1,"1") :: TestData(2,null) :: Nil val rdd1 = sparkContext.parallelize((0 to 1).map(i => nullCheckData1(i))) From 53f0a00b6051fb6cb52a90f91ae01bcd77e332c5 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Tue, 30 Dec 2014 12:11:44 -0800 Subject: [PATCH 020/116] [Spark-4512] [SQL] Unresolved Attribute Exception in Sort By It will cause exception while do query like: SELECT key+key FROM src sort by value; Author: Cheng Hao Closes #3386 from chenghao-intel/sort and squashes the following commits: 38c78cc [Cheng Hao] revert the SortPartition in SparkStrategies 7e9dd15 [Cheng Hao] update the typo fcd1d64 [Cheng Hao] rebase the latest master and update the SortBy unit test --- .../apache/spark/sql/catalyst/SqlParser.scala | 4 ++-- .../sql/catalyst/analysis/Analyzer.scala | 13 +++++++------ .../spark/sql/catalyst/dsl/package.scala | 4 ++-- .../plans/logical/basicOperators.scala | 11 ++++++++++- .../org/apache/spark/sql/SchemaRDD.scala | 5 ++--- .../spark/sql/execution/SparkStrategies.scala | 11 +++++------ .../org/apache/spark/sql/DslQuerySuite.scala | 19 ++++++++++++++----- 
.../scala/org/apache/spark/sql/TestData.scala | 2 +- .../org/apache/spark/sql/hive/HiveQl.scala | 8 ++++---- .../hive/execution/HiveComparisonTest.scala | 2 +- .../sql/hive/execution/SQLQuerySuite.scala | 7 +++++++ 11 files changed, 55 insertions(+), 31 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index 66860a4c0923a..f79d4ff444dc0 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -204,8 +204,8 @@ class SqlParser extends AbstractSparkSQLParser { ) protected lazy val sortType: Parser[LogicalPlan => LogicalPlan] = - ( ORDER ~ BY ~> ordering ^^ { case o => l: LogicalPlan => Sort(o, l) } - | SORT ~ BY ~> ordering ^^ { case o => l: LogicalPlan => SortPartitions(o, l) } + ( ORDER ~ BY ~> ordering ^^ { case o => l: LogicalPlan => Sort(o, true, l) } + | SORT ~ BY ~> ordering ^^ { case o => l: LogicalPlan => Sort(o, false, l) } ) protected lazy val ordering: Parser[Seq[SortOrder]] = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 1c4088b8438e1..72680f37a0b4d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -246,7 +246,7 @@ class Analyzer(catalog: Catalog, case p: LogicalPlan if !p.childrenResolved => p // If the projection list contains Stars, expand it. - case p@Project(projectList, child) if containsStar(projectList) => + case p @ Project(projectList, child) if containsStar(projectList) => Project( projectList.flatMap { case s: Star => s.expand(child.output, resolver) @@ -310,7 +310,8 @@ class Analyzer(catalog: Catalog, */ object ResolveSortReferences extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { - case s @ Sort(ordering, p @ Project(projectList, child)) if !s.resolved && p.resolved => + case s @ Sort(ordering, global, p @ Project(projectList, child)) + if !s.resolved && p.resolved => val unresolved = ordering.flatMap(_.collect { case UnresolvedAttribute(name) => name }) val resolved = unresolved.flatMap(child.resolve(_, resolver)) val requiredAttributes = AttributeSet(resolved.collect { case a: Attribute => a }) @@ -319,13 +320,14 @@ class Analyzer(catalog: Catalog, if (missingInProject.nonEmpty) { // Add missing attributes and then project them away after the sort. Project(projectList.map(_.toAttribute), - Sort(ordering, + Sort(ordering, global, Project(projectList ++ missingInProject, child))) } else { logDebug(s"Failed to find $missingInProject in ${p.output.mkString(", ")}") s // Nothing we can do here. Return original plan. } - case s @ Sort(ordering, a @ Aggregate(grouping, aggs, child)) if !s.resolved && a.resolved => + case s @ Sort(ordering, global, a @ Aggregate(grouping, aggs, child)) + if !s.resolved && a.resolved => val unresolved = ordering.flatMap(_.collect { case UnresolvedAttribute(name) => name }) // A small hack to create an object that will allow us to resolve any references that // refer to named expressions that are present in the grouping expressions. @@ -340,8 +342,7 @@ class Analyzer(catalog: Catalog, if (missingInAggs.nonEmpty) { // Add missing grouping exprs and then project them away after the sort. 
Project(a.output, - Sort(ordering, - Aggregate(grouping, aggs ++ missingInAggs, child))) + Sort(ordering, global, Aggregate(grouping, aggs ++ missingInAggs, child))) } else { s // Nothing we can do here. Return original plan. } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index fb252cdf51534..a14e5b9ef14d0 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -244,9 +244,9 @@ package object dsl { condition: Option[Expression] = None) = Join(logicalPlan, otherPlan, joinType, condition) - def orderBy(sortExprs: SortOrder*) = Sort(sortExprs, logicalPlan) + def orderBy(sortExprs: SortOrder*) = Sort(sortExprs, true, logicalPlan) - def sortBy(sortExprs: SortOrder*) = SortPartitions(sortExprs, logicalPlan) + def sortBy(sortExprs: SortOrder*) = Sort(sortExprs, false, logicalPlan) def groupBy(groupingExprs: Expression*)(aggregateExprs: Expression*) = { val aliasedExprs = aggregateExprs.map { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index a9282b98adfab..0b9f01cbae9ea 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -130,7 +130,16 @@ case class WriteToFile( override def output = child.output } -case class Sort(order: Seq[SortOrder], child: LogicalPlan) extends UnaryNode { +/** + * @param order The ordering expressions + * @param global True means global sorting apply for entire data set, + * False means sorting only apply within the partition. + * @param child Child logical plan + */ +case class Sort( + order: Seq[SortOrder], + global: Boolean, + child: LogicalPlan) extends UnaryNode { override def output = child.output } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala index 856b10f1a8fd8..80787b61ce1bf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala @@ -214,7 +214,7 @@ class SchemaRDD( * @group Query */ def orderBy(sortExprs: SortOrder*): SchemaRDD = - new SchemaRDD(sqlContext, Sort(sortExprs, logicalPlan)) + new SchemaRDD(sqlContext, Sort(sortExprs, true, logicalPlan)) /** * Sorts the results by the given expressions within partition. 
@@ -227,7 +227,7 @@ class SchemaRDD( * @group Query */ def sortBy(sortExprs: SortOrder*): SchemaRDD = - new SchemaRDD(sqlContext, SortPartitions(sortExprs, logicalPlan)) + new SchemaRDD(sqlContext, Sort(sortExprs, false, logicalPlan)) @deprecated("use limit with integer argument", "1.1.0") def limit(limitExpr: Expression): SchemaRDD = @@ -238,7 +238,6 @@ class SchemaRDD( * {{{ * schemaRDD.limit(10) * }}} - * * @group Query */ def limit(limitNum: Int): SchemaRDD = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 2954d4ce7d2d8..9151da69ed44c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -190,7 +190,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { object TakeOrdered extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case logical.Limit(IntegerLiteral(limit), logical.Sort(order, child)) => + case logical.Limit(IntegerLiteral(limit), logical.Sort(order, true, child)) => execution.TakeOrdered(limit, order, planLater(child)) :: Nil case _ => Nil } @@ -257,15 +257,14 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { execution.Distinct(partial = false, execution.Distinct(partial = true, planLater(child))) :: Nil - case logical.Sort(sortExprs, child) if sqlContext.externalSortEnabled => - execution.ExternalSort(sortExprs, global = true, planLater(child)):: Nil - case logical.Sort(sortExprs, child) => - execution.Sort(sortExprs, global = true, planLater(child)):: Nil - case logical.SortPartitions(sortExprs, child) => // This sort only sorts tuples within a partition. Its requiredDistribution will be // an UnspecifiedDistribution. 
execution.Sort(sortExprs, global = false, planLater(child)) :: Nil + case logical.Sort(sortExprs, global, child) if sqlContext.externalSortEnabled => + execution.ExternalSort(sortExprs, global, planLater(child)):: Nil + case logical.Sort(sortExprs, global, child) => + execution.Sort(sortExprs, global, planLater(child)):: Nil case logical.Project(projectList, child) => execution.Project(projectList, planLater(child)) :: Nil case logical.Filter(condition, child) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala index 691c4b38287bf..c0b9cf5163120 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala @@ -88,7 +88,7 @@ class DslQuerySuite extends QueryTest { Seq(Seq(6))) } - test("sorting") { + test("global sorting") { checkAnswer( testData2.orderBy('a.asc, 'b.asc), Seq((1,1), (1,2), (2,1), (2,2), (3,1), (3,2))) @@ -122,22 +122,31 @@ class DslQuerySuite extends QueryTest { mapData.collect().sortBy(_.data(1)).reverse.toSeq) } - test("sorting #2") { + test("partition wide sorting") { + // 2 partitions totally, and + // Partition #1 with values: + // (1, 1) + // (1, 2) + // (2, 1) + // Partition #2 with values: + // (2, 2) + // (3, 1) + // (3, 2) checkAnswer( testData2.sortBy('a.asc, 'b.asc), Seq((1,1), (1,2), (2,1), (2,2), (3,1), (3,2))) checkAnswer( testData2.sortBy('a.asc, 'b.desc), - Seq((1,2), (1,1), (2,2), (2,1), (3,2), (3,1))) + Seq((1,2), (1,1), (2,1), (2,2), (3,2), (3,1))) checkAnswer( testData2.sortBy('a.desc, 'b.desc), - Seq((3,2), (3,1), (2,2), (2,1), (1,2), (1,1))) + Seq((2,1), (1,2), (1,1), (3,2), (3,1), (2,2))) checkAnswer( testData2.sortBy('a.desc, 'b.asc), - Seq((3,1), (3,2), (2,1), (2,2), (1,1), (1,2))) + Seq((2,1), (1,1), (1,2), (3,1), (3,2), (2,2))) } test("limit") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala index bb553a0a1e50c..497897c3c0d4d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala @@ -55,7 +55,7 @@ object TestData { TestData2(2, 1) :: TestData2(2, 2) :: TestData2(3, 1) :: - TestData2(3, 2) :: Nil).toSchemaRDD + TestData2(3, 2) :: Nil, 2).toSchemaRDD testData2.registerTempTable("testData2") case class DecimalData(a: BigDecimal, b: BigDecimal) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 3f3d9e7cd4fbe..8a9613cf96e54 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -680,16 +680,16 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val withSort = (orderByClause, sortByClause, distributeByClause, clusterByClause) match { case (Some(totalOrdering), None, None, None) => - Sort(totalOrdering.getChildren.map(nodeToSortOrder), withHaving) + Sort(totalOrdering.getChildren.map(nodeToSortOrder), true, withHaving) case (None, Some(perPartitionOrdering), None, None) => - SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder), withHaving) + Sort(perPartitionOrdering.getChildren.map(nodeToSortOrder), false, withHaving) case (None, None, Some(partitionExprs), None) => Repartition(partitionExprs.getChildren.map(nodeToExpr), withHaving) case (None, Some(perPartitionOrdering), 
Some(partitionExprs), None) => - SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder), + Sort(perPartitionOrdering.getChildren.map(nodeToSortOrder), false, Repartition(partitionExprs.getChildren.map(nodeToExpr), withHaving)) case (None, None, None, Some(clusterExprs)) => - SortPartitions(clusterExprs.getChildren.map(nodeToExpr).map(SortOrder(_, Ascending)), + Sort(clusterExprs.getChildren.map(nodeToExpr).map(SortOrder(_, Ascending)), false, Repartition(clusterExprs.getChildren.map(nodeToExpr), withHaving)) case (None, None, None, None) => withHaving case _ => sys.error("Unsupported set of ordering / distribution clauses.") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 8011f9b8773b3..4104df8f8e022 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -132,7 +132,7 @@ abstract class HiveComparisonTest def isSorted(plan: LogicalPlan): Boolean = plan match { case _: Join | _: Aggregate | _: Generate | _: Sample | _: Distinct => false - case PhysicalOperation(_, _, Sort(_, _)) => true + case PhysicalOperation(_, _, Sort(_, true, _)) => true case _ => plan.children.iterator.exists(isSorted) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index f57f31af15566..5d0fb7237011f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -32,6 +32,13 @@ case class Nested3(f3: Int) * valid, but Hive currently cannot execute it. */ class SQLQuerySuite extends QueryTest { + test("SPARK-4512 Fix attribute reference resolution error when using SORT BY") { + checkAnswer( + sql("SELECT * FROM (SELECT key + key AS a FROM src SORT BY value) t ORDER BY t.a"), + sql("SELECT key + key as a FROM src ORDER BY a").collect().toSeq + ) + } + test("CTAS with serde") { sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value").collect sql( From 19a8802e703e6b075a148ba73dc9dd80748d6322 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 30 Dec 2014 12:16:45 -0800 Subject: [PATCH 021/116] [SPARK-4493][SQL] Tests for IsNull / IsNotNull in the ParquetFilterSuite This is a follow-up of #3367 and #3644. At the time #3644 was written, #3367 hadn't been merged yet, thus `IsNull` and `IsNotNull` filters are not covered in the first version of `ParquetFilterSuite`. This PR adds corresponding test cases. 
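For readers who have not looked at Parquet's filter2 API, the sketch below shows roughly what the pushed-down forms of `IsNull` / `IsNotNull` look like there. It assumes the pre-rename `parquet.filter2.predicate` package that Spark depended on at the time, and the column name `_1` simply mirrors the `Tuple1`-based test data; treat it as an approximation rather than the exact code emitted by `ParquetFilters`.

```scala
import parquet.filter2.predicate.FilterApi
import parquet.filter2.predicate.Operators.{Eq, IntColumn, NotEq}

object NullPredicateSketch {
  val column: IntColumn = FilterApi.intColumn("_1")

  // In filter2, comparing a column against null encodes IS NULL / IS NOT NULL,
  // which is why the tests assert on the Eq / NotEq classes for these predicates.
  val isNull: Eq[Integer] = FilterApi.eq(column, null.asInstanceOf[Integer])
  val isNotNull: NotEq[Integer] = FilterApi.notEq(column, null.asInstanceOf[Integer])
}
```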
[Review on Reviewable](https://reviewable.io/reviews/apache/spark/3748) Author: Cheng Lian Closes #3748 from liancheng/test-null-filters and squashes the following commits: 1ab943f [Cheng Lian] IsNull and IsNotNull Parquet filter test case for boolean type bcd616b [Cheng Lian] Adds Parquet filter pushedown tests for IsNull and IsNotNull --- .../sql/parquet/ParquetFilterSuite.scala | 60 +++++++++++++++---- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala index b17300475b6f6..4c3a04506ce42 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala @@ -28,11 +28,14 @@ import org.apache.spark.sql.{QueryTest, SQLConf, SchemaRDD} /** * A test suite that tests Parquet filter2 API based filter pushdown optimization. * - * Notice that `!(a cmp b)` are always transformed to its negated form `a cmp' b` by the - * `BooleanSimplification` optimization rule whenever possible. As a result, predicate `!(a < 1)` - * results a `GtEq` filter predicate rather than a `Not`. + * NOTE: * - * @todo Add test cases for `IsNull` and `IsNotNull` after merging PR #3367 + * 1. `!(a cmp b)` is always transformed to its negated form `a cmp' b` by the + * `BooleanSimplification` optimization rule whenever possible. As a result, predicate `!(a < 1)` + * results in a `GtEq` filter predicate rather than a `Not`. + * + * 2. `Tuple1(Option(x))` is used together with `AnyVal` types like `Int` to ensure the inferred + * data type is nullable. */ class ParquetFilterSuite extends QueryTest with ParquetTest { val sqlContext = TestSQLContext @@ -85,14 +88,26 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { } test("filter pushdown - boolean") { - withParquetRDD((true :: false :: Nil).map(Tuple1.apply)) { rdd => + withParquetRDD((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { rdd => + checkFilterPushdown(rdd, '_1)('_1.isNull, classOf[Eq[java.lang.Boolean]])(Seq.empty[Row]) + checkFilterPushdown(rdd, '_1)('_1.isNotNull, classOf[NotEq[java.lang.Boolean]]) { + Seq(Row(true), Row(false)) + } + checkFilterPushdown(rdd, '_1)('_1 === true, classOf[Eq[java.lang.Boolean]])(true) - checkFilterPushdown(rdd, '_1)('_1 !== true, classOf[Operators.NotEq[java.lang.Boolean]])(false) + checkFilterPushdown(rdd, '_1)('_1 !== true, classOf[Operators.NotEq[java.lang.Boolean]]) { + false + } } } test("filter pushdown - integer") { - withParquetRDD((1 to 4).map(Tuple1.apply)) { rdd => + withParquetRDD((1 to 4).map(i => Tuple1(Option(i)))) { rdd => + checkFilterPushdown(rdd, '_1)('_1.isNull, classOf[Eq[Integer]])(Seq.empty[Row]) + checkFilterPushdown(rdd, '_1)('_1.isNotNull, classOf[NotEq[Integer]]) { + (1 to 4).map(Row.apply(_)) + } + checkFilterPushdown(rdd, '_1)('_1 === 1, classOf[Eq[Integer]])(1) checkFilterPushdown(rdd, '_1)('_1 !== 1, classOf[Operators.NotEq[Integer]]) { (2 to 4).map(Row.apply(_)) @@ -118,7 +133,12 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { } test("filter pushdown - long") { - withParquetRDD((1 to 4).map(i => Tuple1(i.toLong))) { rdd => + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toLong)))) { rdd => + checkFilterPushdown(rdd, '_1)('_1.isNull, classOf[Eq[java.lang.Long]])(Seq.empty[Row]) + checkFilterPushdown(rdd, '_1)('_1.isNotNull, classOf[NotEq[java.lang.Long]]) { + (1 to 4).map(Row.apply(_)) + } 
+ checkFilterPushdown(rdd, '_1)('_1 === 1, classOf[Eq[java.lang.Long]])(1) checkFilterPushdown(rdd, '_1)('_1 !== 1, classOf[Operators.NotEq[java.lang.Long]]) { (2 to 4).map(Row.apply(_)) @@ -144,7 +164,12 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { } test("filter pushdown - float") { - withParquetRDD((1 to 4).map(i => Tuple1(i.toFloat))) { rdd => + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { rdd => + checkFilterPushdown(rdd, '_1)('_1.isNull, classOf[Eq[java.lang.Float]])(Seq.empty[Row]) + checkFilterPushdown(rdd, '_1)('_1.isNotNull, classOf[NotEq[java.lang.Float]]) { + (1 to 4).map(Row.apply(_)) + } + checkFilterPushdown(rdd, '_1)('_1 === 1, classOf[Eq[java.lang.Float]])(1) checkFilterPushdown(rdd, '_1)('_1 !== 1, classOf[Operators.NotEq[java.lang.Float]]) { (2 to 4).map(Row.apply(_)) @@ -170,7 +195,12 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { } test("filter pushdown - double") { - withParquetRDD((1 to 4).map(i => Tuple1(i.toDouble))) { rdd => + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { rdd => + checkFilterPushdown(rdd, '_1)('_1.isNull, classOf[Eq[java.lang.Double]])(Seq.empty[Row]) + checkFilterPushdown(rdd, '_1)('_1.isNotNull, classOf[NotEq[java.lang.Double]]) { + (1 to 4).map(Row.apply(_)) + } + checkFilterPushdown(rdd, '_1)('_1 === 1, classOf[Eq[java.lang.Double]])(1) checkFilterPushdown(rdd, '_1)('_1 !== 1, classOf[Operators.NotEq[java.lang.Double]]) { (2 to 4).map(Row.apply(_)) @@ -197,6 +227,11 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { test("filter pushdown - string") { withParquetRDD((1 to 4).map(i => Tuple1(i.toString))) { rdd => + checkFilterPushdown(rdd, '_1)('_1.isNull, classOf[Eq[java.lang.String]])(Seq.empty[Row]) + checkFilterPushdown(rdd, '_1)('_1.isNotNull, classOf[NotEq[java.lang.String]]) { + (1 to 4).map(i => Row.apply(i.toString)) + } + checkFilterPushdown(rdd, '_1)('_1 === "1", classOf[Eq[String]])("1") checkFilterPushdown(rdd, '_1)('_1 !== "1", classOf[Operators.NotEq[String]]) { (2 to 4).map(i => Row.apply(i.toString)) @@ -227,6 +262,11 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { } withParquetRDD((1 to 4).map(i => Tuple1(i.b))) { rdd => + checkBinaryFilterPushdown(rdd, '_1)('_1.isNull, classOf[Eq[java.lang.String]])(Seq.empty[Row]) + checkBinaryFilterPushdown(rdd, '_1)('_1.isNotNull, classOf[NotEq[java.lang.String]]) { + (1 to 4).map(i => Row.apply(i.b)).toSeq + } + checkBinaryFilterPushdown(rdd, '_1)('_1 === 1.b, classOf[Eq[Array[Byte]]])(1.b) checkBinaryFilterPushdown(rdd, '_1)('_1 !== 1.b, classOf[Operators.NotEq[Array[Byte]]]) { (2 to 4).map(i => Row.apply(i.b)).toSeq From f7a41a0e79561a722e41800257dca886732ccaad Mon Sep 17 00:00:00 2001 From: luogankun Date: Tue, 30 Dec 2014 12:17:49 -0800 Subject: [PATCH 022/116] [SPARK-4916][SQL][DOCS]Update SQL programming guide about cache section `SchemeRDD.cache()` now uses in-memory columnar storage. 
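A hedged, self-contained sketch of what that means in user code is below (the `Record` case class, table name, and local master are invented for illustration); both routes now end up in the same in-memory columnar store, which is why the old warning about `schemaRDD.cache()` is dropped from the guide.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Made-up record type, defined at top level so schema inference can see it.
case class Record(key: Int, value: String)

object CachingSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("caching-sketch"))
    val sqlContext = new SQLContext(sc)
    import sqlContext._   // implicit RDD[Product] -> SchemaRDD conversion

    val records = sc.parallelize(1 to 100).map(i => Record(i, s"val_$i"))
    records.registerTempTable("records")

    // Caching by table name and caching the SchemaRDD itself are now equivalent:
    sqlContext.cacheTable("records")
    // sqlContext.table("records").cache()   // the other, equally columnar, route

    sqlContext.sql("SELECT COUNT(1) FROM records").collect().foreach(println)
    sqlContext.uncacheTable("records")
    sc.stop()
  }
}
```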
Author: luogankun Closes #3759 from luogankun/SPARK-4916 and squashes the following commits: 7b39864 [luogankun] [SPARK-4916]Update SQL programming guide 6018122 [luogankun] Merge branch 'master' of https://github.com/apache/spark into SPARK-4916 0b93785 [luogankun] [SPARK-4916]Update SQL programming guide 99b2336 [luogankun] [SPARK-4916]Update SQL programming guide --- docs/sql-programming-guide.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 2aea8a8aedafc..1b5fde991e405 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -831,13 +831,10 @@ turning on some experimental options. ## Caching Data In Memory -Spark SQL can cache tables using an in-memory columnar format by calling `sqlContext.cacheTable("tableName")`. +Spark SQL can cache tables using an in-memory columnar format by calling `sqlContext.cacheTable("tableName")` or `schemaRDD.cache()`. Then Spark SQL will scan only required columns and will automatically tune compression to minimize memory usage and GC pressure. You can call `sqlContext.uncacheTable("tableName")` to remove the table from memory. -Note that if you call `schemaRDD.cache()` rather than `sqlContext.cacheTable(...)`, tables will _not_ be cached using -the in-memory columnar format, and therefore `sqlContext.cacheTable(...)` is strongly recommended for this use case. - Configuration of in-memory caching can be done using the `setConf` method on SQLContext or by running `SET key=value` commands using SQL. From 2deac748b4e1245c2cb9bd43ad87c80d6d130a83 Mon Sep 17 00:00:00 2001 From: luogankun Date: Tue, 30 Dec 2014 12:18:55 -0800 Subject: [PATCH 023/116] [SPARK-4930][SQL][DOCS]Update SQL programming guide, CACHE TABLE is eager `CACHE TABLE tbl` is now __eager__ by default not __lazy__ Author: luogankun Closes #3773 from luogankun/SPARK-4930 and squashes the following commits: cc17b7d [luogankun] [SPARK-4930][SQL][DOCS]Update SQL programming guide, add CACHE [LAZY] TABLE [AS SELECT] ... bffe0e8 [luogankun] [SPARK-4930][SQL][DOCS]Update SQL programming guide, CACHE TABLE tbl is eager --- docs/sql-programming-guide.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 1b5fde991e405..729045b81a8c0 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1007,12 +1007,11 @@ let user control table caching explicitly: CACHE TABLE logs_last_month; UNCACHE TABLE logs_last_month; -**NOTE:** `CACHE TABLE tbl` is lazy, similar to `.cache` on an RDD. This command only marks `tbl` to ensure that -partitions are cached when calculated but doesn't actually cache it until a query that touches `tbl` is executed. -To force the table to be cached, you may simply count the table immediately after executing `CACHE TABLE`: +**NOTE:** `CACHE TABLE tbl` is now __eager__ by default not __lazy__. Don’t need to trigger cache materialization manually anymore. - CACHE TABLE logs_last_month; - SELECT COUNT(1) FROM logs_last_month; +Spark SQL newly introduced a statement to let user control table caching whether or not lazy since Spark 1.2.0: + + CACHE [LAZY] TABLE [AS SELECT] ... 
Several caching related features are not supported yet: From a75dd83b72586695768c89ed32b240aa8f48f32c Mon Sep 17 00:00:00 2001 From: guowei2 Date: Tue, 30 Dec 2014 12:21:00 -0800 Subject: [PATCH 024/116] [SPARK-4928][SQL] Fix: Operator '>,<,>=,<=' with decimal between different precision report error case operator with decimal between different precision, we need change them to unlimited Author: guowei2 Closes #3767 from guowei2/SPARK-4928 and squashes the following commits: c6a6e3e [guowei2] fix code style 3214e0a [guowei2] add test case b4985a2 [guowei2] fix code style 27adf42 [guowei2] Fix: Operation '>,<,>=,<=' with Decimal report error --- .../catalyst/analysis/HiveTypeCoercion.scala | 16 ++++++++++++++++ .../analysis/DecimalPrecisionSuite.scala | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index e38114ab3cf25..242f28f670298 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -361,6 +361,22 @@ trait HiveTypeCoercion { DecimalType(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) ) + case LessThan(e1 @ DecimalType.Expression(p1, s1), + e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 => + LessThan(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)) + + case LessThanOrEqual(e1 @ DecimalType.Expression(p1, s1), + e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 => + LessThanOrEqual(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)) + + case GreaterThan(e1 @ DecimalType.Expression(p1, s1), + e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 => + GreaterThan(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)) + + case GreaterThanOrEqual(e1 @ DecimalType.Expression(p1, s1), + e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 => + GreaterThanOrEqual(Cast(e1, DecimalType.Unlimited), Cast(e2, DecimalType.Unlimited)) + // Promote integers inside a binary expression with fixed-precision decimals to decimals, // and fixed-precision decimals in an expression with floats / doubles to doubles case b: BinaryExpression if b.left.dataType != b.right.dataType => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala index d5b7d2789a103..3677a6e72e23a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala @@ -49,6 +49,15 @@ class DecimalPrecisionSuite extends FunSuite with BeforeAndAfter { assert(analyzer(plan).schema.fields(0).dataType === expectedType) } + private def checkComparison(expression: Expression, expectedType: DataType): Unit = { + val plan = Project(Alias(expression, "c")() :: Nil, relation) + val comparison = analyzer(plan).collect { + case Project(Alias(e: BinaryComparison, _) :: Nil, _) => e + }.head + assert(comparison.left.dataType === expectedType) + assert(comparison.right.dataType === expectedType) + } + test("basic operations") { checkType(Add(d1, d2), DecimalType(6, 2)) checkType(Subtract(d1, d2), DecimalType(6, 2)) @@ -65,6 +74,14 @@ class 
DecimalPrecisionSuite extends FunSuite with BeforeAndAfter { checkType(Add(Add(d1, d2), Add(d1, d2)), DecimalType(7, 2)) } + test("Comparison operations") { + checkComparison(LessThan(i, d1), DecimalType.Unlimited) + checkComparison(LessThanOrEqual(d1, d2), DecimalType.Unlimited) + checkComparison(GreaterThan(d2, u), DecimalType.Unlimited) + checkComparison(GreaterThanOrEqual(d1, f), DoubleType) + checkComparison(GreaterThan(d2, d2), DecimalType(5, 2)) + } + test("bringing in primitive types") { checkType(Add(d1, i), DecimalType(12, 1)) checkType(Add(d1, f), DoubleType) From 61a99f6a11d85e931e7d60f9ab4370b3b40a52ef Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 30 Dec 2014 13:38:27 -0800 Subject: [PATCH 025/116] [SPARK-4937][SQL] Normalizes conjunctions and disjunctions to eliminate common predicates This PR is a simplified version of several filter optimization rules introduced in #3778 authored by scwf. Newly introduced optimizations include: 1. `a && a` => `a` 2. `a || a` => `a` 3. `(a || b || c || ...) && (a || b || d || ...)` => `a && b && (c || d || ...)` The 3rd rule is particularly useful for optimizing the following query, which is planned into a cartesian product ```sql SELECT * FROM t1, t2 WHERE (t1.key = t2.key AND t1.value > 10) OR (t1.key = t2.key AND t2.value < 20) ``` to the following one, which is planned into an equi-join: ```sql SELECT * FROM t1, t2 WHERE t1.key = t2.key AND (t1.value > 10 OR t2.value < 20) ``` The example above is quite artificial, but common predicates are likely to appear in real life complex queries (like the one mentioned in #3778). A difference between this PR and #3778 is that these optimizations are not limited to `Filter`, but are generalized to all logical plan nodes. Thanks to scwf for bringing up these optimizations, and chenghao-intel for the generalization suggestion. [Review on Reviewable](https://reviewable.io/reviews/apache/spark/3784) Author: Cheng Lian Closes #3784 from liancheng/normalize-filters and squashes the following commits: caca560 [Cheng Lian] Moves filter normalization into BooleanSimplification rule 4ab3a58 [Cheng Lian] Fixes test failure, adds more tests 5d54349 [Cheng Lian] Fixes typo in comment 2abbf8e [Cheng Lian] Forgot our sacred Apache licence header... 
cf95639 [Cheng Lian] Adds an optimization rule for filter normalization --- .../sql/catalyst/expressions/predicates.scala | 9 ++- .../sql/catalyst/optimizer/Optimizer.scala | 27 +++++-- .../optimizer/NormalizeFiltersSuite.scala | 72 +++++++++++++++++++ .../columnar/PartitionBatchPruningSuite.scala | 10 ++- 4 files changed, 110 insertions(+), 8 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFiltersSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 94b6fb084d38a..cb5ff67959868 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.catalyst.expressions -import scala.collection.immutable.HashSet import org.apache.spark.sql.catalyst.analysis.UnresolvedException import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.types.BooleanType @@ -48,6 +47,14 @@ trait PredicateHelper { } } + protected def splitDisjunctivePredicates(condition: Expression): Seq[Expression] = { + condition match { + case Or(cond1, cond2) => + splitDisjunctivePredicates(cond1) ++ splitDisjunctivePredicates(cond2) + case other => other :: Nil + } + } + /** * Returns true if `expr` can be evaluated using only the output of `plan`. This method * can be used to determine when is is acceptable to move expression evaluation within a query diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 0f2eae6400d21..cd3137980ca43 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -294,11 +294,16 @@ object OptimizeIn extends Rule[LogicalPlan] { } /** - * Simplifies boolean expressions where the answer can be determined without evaluating both sides. + * Simplifies boolean expressions: + * + * 1. Simplifies expressions whose answer can be determined without evaluating both sides. + * 2. Eliminates / extracts common factors. + * 3. Removes `Not` operator. + * * Note that this rule can eliminate expressions that might otherwise have been evaluated and thus * is only safe when evaluations of expressions does not result in side effects. */ -object BooleanSimplification extends Rule[LogicalPlan] { +object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper { def apply(plan: LogicalPlan): LogicalPlan = plan transform { case q: LogicalPlan => q transformExpressionsUp { case and @ And(left, right) => @@ -307,7 +312,9 @@ object BooleanSimplification extends Rule[LogicalPlan] { case (l, Literal(true, BooleanType)) => l case (Literal(false, BooleanType), _) => Literal(false) case (_, Literal(false, BooleanType)) => Literal(false) - case (_, _) => and + // a && a && a ... 
=> a + case _ if splitConjunctivePredicates(and).distinct.size == 1 => left + case _ => and } case or @ Or(left, right) => @@ -316,7 +323,19 @@ object BooleanSimplification extends Rule[LogicalPlan] { case (_, Literal(true, BooleanType)) => Literal(true) case (Literal(false, BooleanType), r) => r case (l, Literal(false, BooleanType)) => l - case (_, _) => or + // a || a || a ... => a + case _ if splitDisjunctivePredicates(or).distinct.size == 1 => left + // (a && b && c && ...) || (a && b && d && ...) => a && b && (c || d || ...) + case _ => + val lhsSet = splitConjunctivePredicates(left).toSet + val rhsSet = splitConjunctivePredicates(right).toSet + val common = lhsSet.intersect(rhsSet) + + (lhsSet.diff(common).reduceOption(And) ++ rhsSet.diff(common).reduceOption(And)) + .reduceOption(Or) + .map(_ :: common.toList) + .getOrElse(common.toList) + .reduce(And) } case not @ Not(exp) => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFiltersSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFiltersSuite.scala new file mode 100644 index 0000000000000..906300d8336cb --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFiltersSuite.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.analysis.EliminateAnalysisOperators +import org.apache.spark.sql.catalyst.expressions.{And, Expression, Or} +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.RuleExecutor + +// For implicit conversions +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ + +class NormalizeFiltersSuite extends PlanTest { + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = Seq( + Batch("AnalysisNodes", Once, + EliminateAnalysisOperators), + Batch("NormalizeFilters", FixedPoint(100), + BooleanSimplification, + SimplifyFilters)) + } + + val relation = LocalRelation('a.int, 'b.int, 'c.string) + + def checkExpression(original: Expression, expected: Expression): Unit = { + val actual = Optimize(relation.where(original)).collect { case f: Filter => f.condition }.head + val result = (actual, expected) match { + case (And(l1, r1), And(l2, r2)) => (l1 == l2 && r1 == r2) || (l1 == r2 && l2 == r1) + case (Or (l1, r1), Or (l2, r2)) => (l1 == l2 && r1 == r2) || (l1 == r2 && l2 == r1) + case (lhs, rhs) => lhs fastEquals rhs + } + + assert(result, s"$actual isn't equivalent to $expected") + } + + test("a && a => a") { + checkExpression('a === 1 && 'a === 1, 'a === 1) + checkExpression('a === 1 && 'a === 1 && 'a === 1, 'a === 1) + } + + test("a || a => a") { + checkExpression('a === 1 || 'a === 1, 'a === 1) + checkExpression('a === 1 || 'a === 1 || 'a === 1, 'a === 1) + } + + test("(a && b) || (a && c) => a && (b || c)") { + checkExpression( + ('a === 1 && 'a < 10) || ('a > 2 && 'a === 1), + ('a === 1) && ('a < 10 || 'a > 2)) + + checkExpression( + ('a < 1 && 'b > 2 && 'c.isNull) || ('a < 1 && 'c === "hello" && 'b > 2), + ('c.isNull || 'c === "hello") && 'a < 1 && 'b > 2) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala index 82afa31a99a7e..1915c25392f1e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala @@ -105,7 +105,9 @@ class PartitionBatchPruningSuite extends FunSuite with BeforeAndAfterAll with Be test(query) { val schemaRdd = sql(query) - assertResult(expectedQueryResult.toArray, "Wrong query result") { + val queryExecution = schemaRdd.queryExecution + + assertResult(expectedQueryResult.toArray, s"Wrong query result: $queryExecution") { schemaRdd.collect().map(_.head).toArray } @@ -113,8 +115,10 @@ class PartitionBatchPruningSuite extends FunSuite with BeforeAndAfterAll with Be case in: InMemoryColumnarTableScan => (in.readPartitions.value, in.readBatches.value) }.head - assert(readBatches === expectedReadBatches, "Wrong number of read batches") - assert(readPartitions === expectedReadPartitions, "Wrong number of read partitions") + assert(readBatches === expectedReadBatches, s"Wrong number of read batches: $queryExecution") + assert( + readPartitions === expectedReadPartitions, + s"Wrong number of read partitions: $queryExecution") } } } From 7425bec320227bf8818dc2844c12d5373d166364 Mon Sep 17 00:00:00 2001 From: Michael Davies Date: Tue, 30 Dec 2014 13:40:51 -0800 Subject: [PATCH 026/116] [SPARK-4386] Improve performance when writing 
Parquet files Convert type of RowWriteSupport.attributes to Array. Analysis of performance for writing very wide tables shows that time is spent predominantly in the apply method on the attributes var. The type of attributes was previously LinearSeqOptimized, whose apply is O(N), which made the write O(N^2). Measurements on a 575-column table showed this change made a 6x improvement in write times (a rough sketch of the indexing cost follows below). Author: Michael Davies Closes #3843 from MickDavies/SPARK-4386 and squashes the following commits: 892519d [Michael Davies] [SPARK-4386] Improve performance when writing Parquet files --- .../org/apache/spark/sql/parquet/ParquetTableSupport.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala index ef3687e692964..9049eb5932b79 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala @@ -130,7 +130,7 @@ private[parquet] object RowReadSupport { private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging { private[parquet] var writer: RecordConsumer = null - private[parquet] var attributes: Seq[Attribute] = null + private[parquet] var attributes: Array[Attribute] = null override def init(configuration: Configuration): WriteSupport.WriteContext = { val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA) @@ -138,7 +138,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging { metadata.put(RowReadSupport.SPARK_METADATA_KEY, origAttributesStr) if (attributes == null) { - attributes = ParquetTypesConverter.convertFromString(origAttributesStr) + attributes = ParquetTypesConverter.convertFromString(origAttributesStr).toArray } log.debug(s"write support initialized for requested schema $attributes") From 8f29b7cafc2b6e802e4eb21f681d6369da2f30fa Mon Sep 17 00:00:00 2001 From: wangfei Date: Tue, 30 Dec 2014 13:44:30 -0800 Subject: [PATCH 027/116] [SPARK-4935][SQL] When hive.cli.print.header is configured, spark-sql aborts if passed an invalid SQL statement If we pass in an invalid SQL statement like ```abdcdfsfs```, the spark-sql script aborts.
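The sketch referenced above for the SPARK-4386 change: a rough, self-contained illustration of why indexed access on a linear Seq is the hot spot, not code taken from the patch (the helper name and element type are made up; only the 575-column width comes from the commit message).

```scala
// A wide "row" of 575 column values, first as a List (a LinearSeqOptimized), then as an Array.
val asList: Seq[Int]    = List.tabulate(575)(i => i)
val asArray: Array[Int] = asList.toArray

// Writing a row touches every column by index. For a List, apply(i) walks i links, so the whole
// loop is O(N^2); for an Array, apply(i) is constant time and the loop is O(N).
def writeRow(cols: Seq[Int]): Long = {
  var acc = 0L
  var i = 0
  while (i < cols.length) {  // length is also O(N) for a List
    acc += cols(i)           // analogous to the per-column lookup in RowWriteSupport
    i += 1
  }
  acc
}

writeRow(asList)   // quadratic in the number of columns
writeRow(asArray)  // linear in the number of columns
```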
Author: wangfei Author: Fei Wang Closes #3761 from scwf/patch-10 and squashes the following commits: 46dc344 [Fei Wang] revert console.printError(rc.getErrorMessage()) 0330e07 [wangfei] avoid to print error message repeatedly 1614a11 [wangfei] spark-sql abort when passed in a wrong sql --- .../spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala index 6ed8fd2768f95..7a3d76c61c3a1 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala @@ -60,7 +60,7 @@ private[hive] abstract class AbstractSparkSQLDriver( } catch { case cause: Throwable => logError(s"Failed in [$command]", cause) - new CommandProcessorResponse(0, ExceptionUtils.getFullStackTrace(cause), null) + new CommandProcessorResponse(1, ExceptionUtils.getFullStackTrace(cause), null) } } From 07fa1910d9c4092d670381c447403105f01c584e Mon Sep 17 00:00:00 2001 From: wangxiaojing Date: Tue, 30 Dec 2014 13:54:12 -0800 Subject: [PATCH 028/116] [SPARK-4570][SQL]add BroadcastLeftSemiJoinHash JIRA issue: [SPARK-4570](https://issues.apache.org/jira/browse/SPARK-4570) We are planning to create a `BroadcastLeftSemiJoinHash` to implement the broadcast join for `left semijoin`. In a left semi join, if the size of the data from the right side is smaller than the user-settable threshold `AUTO_BROADCASTJOIN_THRESHOLD`, the planner marks it as the `broadcast` relation and marks the other relation as the stream side. The broadcast table will be broadcasted to all of the executors involved in the join, as a `org.apache.spark.broadcast.Broadcast` object. It will use `joins.BroadcastLeftSemiJoinHash`; otherwise it will use `joins.LeftSemiJoinHash`. The benchmark suggests these changes made the optimized version 4x faster for `left semijoin`:

Original:
left semi join : 9288 ms
Optimized:
left semi join : 1963 ms
The micro benchmark loads `data1/kv3.txt` into a normal Hive table. Benchmark code:

 def benchmark(f: => Unit) = {
    val begin = System.currentTimeMillis()
    f
    val end = System.currentTimeMillis()
    end - begin
  }
  val sc = new SparkContext(
    new SparkConf()
      .setMaster("local")
      .setAppName(getClass.getSimpleName.stripSuffix("$")))
  val hiveContext = new HiveContext(sc)
  import hiveContext._
  sql("drop table if exists left_table")
  sql("drop table if exists right_table")
  sql( """create table left_table (key int, value string)
       """.stripMargin)
  sql( s"""load data local inpath "/data1/kv3.txt" into table left_table""")
  sql( """create table right_table (key int, value string)
       """.stripMargin)
  sql(
    """
      |from left_table
      |insert overwrite table right_table
      |select left_table.key, left_table.value
    """.stripMargin)

  val leftSemiJoin = sql(
    """select a.key from left_table a
      |left semi join right_table b on a.key = b.key""".stripMargin)
  val leftSemiJoinDuration = benchmark(leftSemiJoin.count())
  println(s"left semi join : $leftSemiJoinDuration ms ")
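As a conceptual sketch of the broadcast semi-join idea described above, here is plain RDD-level code, not the Catalyst operator added by this patch; the function name and key/value types are made up for illustration.

```scala
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Keep each left row whose key has at least one match on the (small) right side,
// without shuffling the large left side: the right-side keys are broadcast instead.
def broadcastLeftSemiJoin(
    sc: SparkContext,
    left: RDD[(Int, String)],     // large stream side
    right: RDD[(Int, String)]     // small build side
  ): RDD[(Int, String)] = {
  // Collect only the right-side keys on the driver and ship them to every executor once.
  val rightKeys = sc.broadcast(right.map(_._1).collect().toSet)
  // LEFT SEMI JOIN semantics: filter the left side by key membership.
  left.filter { case (k, _) => rightKeys.value.contains(k) }
}
```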
Author: wangxiaojing Closes #3442 from wangxiaojing/SPARK-4570 and squashes the following commits: a4a43c9 [wangxiaojing] rebase f103983 [wangxiaojing] change style fbe4887 [wangxiaojing] change style ff2e618 [wangxiaojing] add testsuite 1a8da2a [wangxiaojing] add BroadcastLeftSemiJoinHash --- .../spark/sql/execution/SparkStrategies.scala | 6 ++ .../joins/BroadcastLeftSemiJoinHash.scala | 67 +++++++++++++++++++ .../org/apache/spark/sql/JoinSuite.scala | 38 +++++++++++ .../spark/sql/hive/StatisticsSuite.scala | 50 +++++++++++++- 4 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 9151da69ed44c..ce878c137e627 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -33,6 +33,12 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { object LeftSemiJoin extends Strategy with PredicateHelper { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case ExtractEquiJoinKeys(LeftSemi, leftKeys, rightKeys, condition, left, right) + if sqlContext.autoBroadcastJoinThreshold > 0 && + right.statistics.sizeInBytes <= sqlContext.autoBroadcastJoinThreshold => + val semiJoin = joins.BroadcastLeftSemiJoinHash( + leftKeys, rightKeys, planLater(left), planLater(right)) + condition.map(Filter(_, semiJoin)).getOrElse(semiJoin) :: Nil // Find left semi joins where at least some predicates can be evaluated by matching join keys case ExtractEquiJoinKeys(LeftSemi, leftKeys, rightKeys, condition, left, right) => val semiJoin = joins.LeftSemiJoinHash( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala new file mode 100644 index 0000000000000..2ab064fd0151e --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.joins + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.sql.catalyst.expressions.{Expression, Row} +import org.apache.spark.sql.catalyst.plans.physical.ClusteredDistribution +import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} + +/** + * :: DeveloperApi :: + * Build the right table's join keys into a HashSet, and iteratively go through the left + * table, to find the if join keys are in the Hash set. + */ +@DeveloperApi +case class BroadcastLeftSemiJoinHash( + leftKeys: Seq[Expression], + rightKeys: Seq[Expression], + left: SparkPlan, + right: SparkPlan) extends BinaryNode with HashJoin { + + override val buildSide = BuildRight + + override def output = left.output + + override def execute() = { + val buildIter= buildPlan.execute().map(_.copy()).collect().toIterator + val hashSet = new java.util.HashSet[Row]() + var currentRow: Row = null + + // Create a Hash set of buildKeys + while (buildIter.hasNext) { + currentRow = buildIter.next() + val rowKey = buildSideKeyGenerator(currentRow) + if (!rowKey.anyNull) { + val keyExists = hashSet.contains(rowKey) + if (!keyExists) { + hashSet.add(rowKey) + } + } + } + + val broadcastedRelation = sparkContext.broadcast(hashSet) + + streamedPlan.execute().mapPartitions { streamIter => + val joinKeys = streamSideKeyGenerator() + streamIter.filter(current => { + !joinKeys(current).anyNull && broadcastedRelation.value.contains(joinKeys.currentValue) + }) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 0378fd7e367f0..1a4232dab86e7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -48,6 +48,7 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { case j: LeftSemiJoinBNL => j case j: CartesianProduct => j case j: BroadcastNestedLoopJoin => j + case j: BroadcastLeftSemiJoinHash => j } assert(operators.size === 1) @@ -382,4 +383,41 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { """.stripMargin), (null, 10) :: Nil) } + + test("broadcasted left semi join operator selection") { + clearCache() + sql("CACHE TABLE testData") + val tmp = autoBroadcastJoinThreshold + + sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=1000000000") + Seq( + ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", + classOf[BroadcastLeftSemiJoinHash]) + ).foreach { + case (query, joinClass) => assertJoin(query, joinClass) + } + + sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1") + + Seq( + ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", classOf[LeftSemiJoinHash]) + ).foreach { + case (query, joinClass) => assertJoin(query, joinClass) + } + + setConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD, tmp.toString) + sql("UNCACHE TABLE testData") + } + + test("left semi join") { + val rdd = sql("SELECT * FROM testData2 LEFT SEMI JOIN testData ON key = a") + checkAnswer(rdd, + (1, 1) :: + (1, 2) :: + (2, 1) :: + (2, 2) :: + (3, 1) :: + (3, 2) :: Nil) + + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index ff4071d8e2f10..4b6a9308b9811 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -22,7 +22,7 @@ import org.scalatest.BeforeAndAfterAll import 
scala.reflect.ClassTag import org.apache.spark.sql.{SQLConf, QueryTest} -import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, ShuffledHashJoin} +import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.execution._ @@ -193,4 +193,52 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { ) } + test("auto converts to broadcast left semi join, by size estimate of a relation") { + val leftSemiJoinQuery = + """SELECT * FROM src a + |left semi JOIN src b ON a.key=86 and a.key = b.key""".stripMargin + val answer = (86, "val_86") :: Nil + + var rdd = sql(leftSemiJoinQuery) + + // Assert src has a size smaller than the threshold. + val sizes = rdd.queryExecution.analyzed.collect { + case r if implicitly[ClassTag[MetastoreRelation]].runtimeClass + .isAssignableFrom(r.getClass) => + r.statistics.sizeInBytes + } + assert(sizes.size === 2 && sizes(1) <= autoBroadcastJoinThreshold + && sizes(0) <= autoBroadcastJoinThreshold, + s"query should contain two relations, each of which has size smaller than autoConvertSize") + + // Using `sparkPlan` because for relevant patterns in HashJoin to be + // matched, other strategies need to be applied. + var bhj = rdd.queryExecution.sparkPlan.collect { + case j: BroadcastLeftSemiJoinHash => j + } + assert(bhj.size === 1, + s"actual query plans do not contain broadcast join: ${rdd.queryExecution}") + + checkAnswer(rdd, answer) // check correctness of output + + TestHive.settings.synchronized { + val tmp = autoBroadcastJoinThreshold + + sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1") + rdd = sql(leftSemiJoinQuery) + bhj = rdd.queryExecution.sparkPlan.collect { + case j: BroadcastLeftSemiJoinHash => j + } + assert(bhj.isEmpty, "BroadcastHashJoin still planned even though it is switched off") + + val shj = rdd.queryExecution.sparkPlan.collect { + case j: LeftSemiJoinHash => j + } + assert(shj.size === 1, + "LeftSemiJoinHash should be planned when BroadcastHashJoin is turned off") + + sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=$tmp") + } + + } } From b239ea1c31aeaa752d5dc8f45423df1f8c0924ca Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 30 Dec 2014 14:00:57 -0800 Subject: [PATCH 029/116] SPARK-3955 part 2 [CORE] [HOTFIX] Different versions between jackson-mapper-asl and jackson-core-asl pwendell https://github.com/apache/spark/commit/2483c1efb6429a7d8a20c96d18ce2fec93a1aff9 didn't actually add a reference to `jackson-core-asl` as intended, but a second redundant reference to `jackson-mapper-asl`, as markhamstra picked up on (https://github.com/apache/spark/pull/3716#issuecomment-68180192) This just rectifies the typo. I missed it as well; the original PR https://github.com/apache/spark/pull/2818 had it correct and I also didn't see the problem. 
Author: Sean Owen Closes #3829 from srowen/SPARK-3955 and squashes the following commits: 6cfdc4e [Sean Owen] Actually refer to jackson-core-asl --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a843af2b22d6c..05f59a9b4140b 100644 --- a/pom.xml +++ b/pom.xml @@ -827,7 +827,7 @@ org.codehaus.jackson - jackson-mapper-asl + jackson-core-asl ${jackson.version} From 0f31992c61f6662e5347745f6a1ac272a5fd63c9 Mon Sep 17 00:00:00 2001 From: Jakub Dubovsky Date: Tue, 30 Dec 2014 14:19:07 -0800 Subject: [PATCH 030/116] [Spark-4995] Replace Vector.toBreeze.activeIterator with foreachActive New foreachActive method of vector was introduced by SPARK-4431 as more efficient alternative to vector.toBreeze.activeIterator. There are some parts of codebase where it was not yet replaced. dbtsai Author: Jakub Dubovsky Closes #3846 from james64/SPARK-4995-foreachActive and squashes the following commits: 3eb7e37 [Jakub Dubovsky] Scalastyle fix 32fe6c6 [Jakub Dubovsky] activeIterator removed - IndexedRowMatrix.toBreeze 47a4777 [Jakub Dubovsky] activeIterator removed in RowMatrix.toBreeze 90a7d98 [Jakub Dubovsky] activeIterator removed in MLUtils.saveAsLibSVMFile --- .../spark/mllib/linalg/distributed/IndexedRowMatrix.scala | 2 +- .../apache/spark/mllib/linalg/distributed/RowMatrix.scala | 4 ++-- .../main/scala/org/apache/spark/mllib/util/MLUtils.scala | 8 +++++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index 5c1acca0ec532..36d8cadd2bdd7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -142,7 +142,7 @@ class IndexedRowMatrix( val mat = BDM.zeros[Double](m, n) rows.collect().foreach { case IndexedRow(rowIndex, vector) => val i = rowIndex.toInt - vector.toBreeze.activeIterator.foreach { case (j, v) => + vector.foreachActive { case (j, v) => mat(i, j) = v } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 10a515af88802..a3fca53929ab7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -588,8 +588,8 @@ class RowMatrix( val n = numCols().toInt val mat = BDM.zeros[Double](m, n) var i = 0 - rows.collect().foreach { v => - v.toBreeze.activeIterator.foreach { case (j, v) => + rows.collect().foreach { vector => + vector.foreachActive { case (j, v) => mat(i, j) = v } i += 1 diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 1d07b5dab8268..da0da0a168c1d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -154,10 +154,12 @@ object MLUtils { def saveAsLibSVMFile(data: RDD[LabeledPoint], dir: String) { // TODO: allow to specify label precision and feature precision. 
val dataStr = data.map { case LabeledPoint(label, features) => - val featureStrings = features.toBreeze.activeIterator.map { case (i, v) => - s"${i + 1}:$v" + val sb = new StringBuilder(label.toString) + features.foreachActive { case (i, v) => + sb += ' ' + sb ++= s"${i + 1}:$v" } - (Iterator(label) ++ featureStrings).mkString(" ") + sb.mkString } dataStr.saveAsTextFile(dir) } From 6a897829444e2ef273586511f93a40d36e64fb0b Mon Sep 17 00:00:00 2001 From: zsxwing Date: Tue, 30 Dec 2014 14:39:13 -0800 Subject: [PATCH 031/116] [SPARK-4813][Streaming] Fix the issue that ContextWaiter didn't handle 'spurious wakeup' Used `Condition` to rewrite `ContextWaiter` because it provides a convenient API `awaitNanos` for timeout. Author: zsxwing Closes #3661 from zsxwing/SPARK-4813 and squashes the following commits: 52247f5 [zsxwing] Add explicit unit type be42bcf [zsxwing] Update as per review suggestion e06bd4f [zsxwing] Fix the issue that ContextWaiter didn't handle 'spurious wakeup' --- .../spark/streaming/ContextWaiter.scala | 63 ++++++++++++++----- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala b/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala index a0aeacbc733bd..fdbbe2aa6ef08 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala @@ -17,30 +17,63 @@ package org.apache.spark.streaming +import java.util.concurrent.TimeUnit +import java.util.concurrent.locks.ReentrantLock + private[streaming] class ContextWaiter { + + private val lock = new ReentrantLock() + private val condition = lock.newCondition() + + // Guarded by "lock" private var error: Throwable = null - private var stopped: Boolean = false - def notifyError(e: Throwable) = synchronized { - error = e - notifyAll() - } + // Guarded by "lock" + private var stopped: Boolean = false - def notifyStop() = synchronized { - stopped = true - notifyAll() + def notifyError(e: Throwable): Unit = { + lock.lock() + try { + error = e + condition.signalAll() + } finally { + lock.unlock() + } } - def waitForStopOrError(timeout: Long = -1) = synchronized { - // If already had error, then throw it - if (error != null) { - throw error + def notifyStop(): Unit = { + lock.lock() + try { + stopped = true + condition.signalAll() + } finally { + lock.unlock() } + } - // If not already stopped, then wait - if (!stopped) { - if (timeout < 0) wait() else wait(timeout) + /** + * Return `true` if it's stopped; or throw the reported error if `notifyError` has been called; or + * `false` if the waiting time detectably elapsed before return from the method. + */ + def waitForStopOrError(timeout: Long = -1): Boolean = { + lock.lock() + try { + if (timeout < 0) { + while (!stopped && error == null) { + condition.await() + } + } else { + var nanos = TimeUnit.MILLISECONDS.toNanos(timeout) + while (!stopped && error == null && nanos > 0) { + nanos = condition.awaitNanos(nanos) + } + } + // If already had error, then throw it if (error != null) throw error + // already stopped or timeout + stopped + } finally { + lock.unlock() } } } From 035bac88c732247c79a1bbad4f9191090cbbdc9a Mon Sep 17 00:00:00 2001 From: Liu Jiongzhou Date: Tue, 30 Dec 2014 15:55:56 -0800 Subject: [PATCH 032/116] [SPARK-4998][MLlib]delete the "train" function To make the functions with the same in "object" effective, specially when using java reflection. 
As the "train" function defined in "class DecisionTree" will hide the functions with the same name in "object DecisionTree". JIRA[SPARK-4998] Author: Liu Jiongzhou Closes #3836 from ljzzju/master and squashes the following commits: 4e13133 [Liu Jiongzhou] [MLlib]delete the "train" function --- .../scala/org/apache/spark/mllib/tree/DecisionTree.scala | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 73e7e32c6db31..b3e8ed9af8c51 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -64,13 +64,6 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo val rfModel = rf.run(input) rfModel.trees(0) } - - /** - * Trains a decision tree model over an RDD. This is deprecated because it hides the static - * methods with the same name in Java. - */ - @deprecated("Please use DecisionTree.run instead.", "1.2.0") - def train(input: RDD[LabeledPoint]): DecisionTreeModel = run(input) } object DecisionTree extends Serializable with Logging { From 352ed6bbe3c3b67e52e298e7c535ae414d96beca Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 30 Dec 2014 18:12:20 -0800 Subject: [PATCH 033/116] [SPARK-1010] Clean up uses of System.setProperty in unit tests Several of our tests call System.setProperty (or test code which implicitly sets system properties) and don't always reset/clear the modified properties, which can create ordering dependencies between tests and cause hard-to-diagnose failures. This patch removes most uses of System.setProperty from our tests, since in most cases we can use SparkConf to set these configurations (there are a few exceptions, including the tests of SparkConf itself). For the cases where we continue to use System.setProperty, this patch introduces a `ResetSystemProperties` ScalaTest mixin class which snapshots the system properties before individual tests and to automatically restores them on test completion / failure. See the block comment at the top of the ResetSystemProperties class for more details. Author: Josh Rosen Closes #3739 from JoshRosen/cleanup-system-properties-in-tests and squashes the following commits: 0236d66 [Josh Rosen] Replace setProperty uses in two example programs / tools 3888fe3 [Josh Rosen] Remove setProperty use in LocalJavaStreamingContext 4f4031d [Josh Rosen] Add note on why SparkSubmitSuite needs ResetSystemProperties 4742a5b [Josh Rosen] Clarify ResetSystemProperties trait inheritance ordering. 0eaf0b6 [Josh Rosen] Remove setProperty call in TaskResultGetterSuite. 7a3d224 [Josh Rosen] Fix trait ordering 3fdb554 [Josh Rosen] Remove setProperty call in TaskSchedulerImplSuite bee20df [Josh Rosen] Remove setProperty calls in SparkContextSchedulerCreationSuite 655587c [Josh Rosen] Remove setProperty calls in JobCancellationSuite 3f2f955 [Josh Rosen] Remove System.setProperty calls in DistributedSuite cfe9cce [Josh Rosen] Remove use of system properties in SparkContextSuite 8783ab0 [Josh Rosen] Remove TestUtils.setSystemProperty, since it is subsumed by the ResetSystemProperties trait. 
633a84a [Josh Rosen] Remove use of system properties in FileServerSuite 25bfce2 [Josh Rosen] Use ResetSystemProperties in UtilsSuite 1d1aa5a [Josh Rosen] Use ResetSystemProperties in SizeEstimatorSuite dd9492b [Josh Rosen] Use ResetSystemProperties in AkkaUtilsSuite b0daff2 [Josh Rosen] Use ResetSystemProperties in BlockManagerSuite e9ded62 [Josh Rosen] Use ResetSystemProperties in TaskSchedulerImplSuite 5b3cb54 [Josh Rosen] Use ResetSystemProperties in SparkListenerSuite 0995c4b [Josh Rosen] Use ResetSystemProperties in SparkContextSchedulerCreationSuite c83ded8 [Josh Rosen] Use ResetSystemProperties in SparkConfSuite 51aa870 [Josh Rosen] Use withSystemProperty in ShuffleSuite 60a63a1 [Josh Rosen] Use ResetSystemProperties in JobCancellationSuite 14a92e4 [Josh Rosen] Use withSystemProperty in FileServerSuite 628f46c [Josh Rosen] Use ResetSystemProperties in DistributedSuite 9e3e0dd [Josh Rosen] Add ResetSystemProperties test fixture mixin; use it in SparkSubmitSuite. 4dcea38 [Josh Rosen] Move withSystemProperty to TestUtils class. --- .../org/apache/spark/DistributedSuite.scala | 21 ++----- .../org/apache/spark/FileServerSuite.scala | 16 ++--- .../apache/spark/JobCancellationSuite.scala | 21 +++---- .../scala/org/apache/spark/ShuffleSuite.scala | 22 +++---- .../org/apache/spark/SparkConfSuite.scala | 51 ++++++--------- .../SparkContextSchedulerCreationSuite.scala | 31 ++++------ .../org/apache/spark/SparkContextSuite.scala | 62 +++++++------------ .../spark/deploy/SparkSubmitSuite.scala | 6 +- .../spark/scheduler/SparkListenerSuite.scala | 9 +-- .../scheduler/TaskResultGetterSuite.scala | 23 +++---- .../scheduler/TaskSchedulerImplSuite.scala | 6 +- .../spark/storage/BlockManagerSuite.scala | 23 +++---- .../apache/spark/util/AkkaUtilsSuite.scala | 2 +- .../spark/util/ResetSystemProperties.scala | 57 +++++++++++++++++ .../spark/util/SizeEstimatorSuite.scala | 38 +++--------- .../org/apache/spark/util/UtilsSuite.scala | 2 +- .../apache/spark/examples/BroadcastTest.scala | 6 +- .../streaming/LocalJavaStreamingContext.java | 8 ++- .../streaming/LocalJavaStreamingContext.java | 8 ++- .../streaming/LocalJavaStreamingContext.java | 8 ++- .../streaming/LocalJavaStreamingContext.java | 8 ++- .../streaming/LocalJavaStreamingContext.java | 8 ++- .../spark/tools/StoragePerfTester.scala | 12 ++-- 23 files changed, 216 insertions(+), 232 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index 998f3008ec0ea..97ea3578aa8ba 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark -import org.scalatest.BeforeAndAfter import org.scalatest.FunSuite import org.scalatest.concurrent.Timeouts._ import org.scalatest.Matchers @@ -29,16 +28,10 @@ class NotSerializableClass class NotSerializableExn(val notSer: NotSerializableClass) extends Throwable() {} -class DistributedSuite extends FunSuite with Matchers with BeforeAndAfter - with LocalSparkContext { +class DistributedSuite extends FunSuite with Matchers with LocalSparkContext { val clusterUrl = "local-cluster[2,1,512]" - after { - System.clearProperty("spark.reducer.maxMbInFlight") - System.clearProperty("spark.storage.memoryFraction") - } - test("task throws not serializable exception") { // Ensures that executors do not crash when an exn is not 
serializable. If executors crash, // this test will hang. Correct behavior is that executors don't crash but fail tasks @@ -84,15 +77,14 @@ class DistributedSuite extends FunSuite with Matchers with BeforeAndAfter } test("groupByKey where map output sizes exceed maxMbInFlight") { - System.setProperty("spark.reducer.maxMbInFlight", "1") - sc = new SparkContext(clusterUrl, "test") + val conf = new SparkConf().set("spark.reducer.maxMbInFlight", "1") + sc = new SparkContext(clusterUrl, "test", conf) // This data should be around 20 MB, so even with 4 mappers and 2 reducers, each map output // file should be about 2.5 MB val pairs = sc.parallelize(1 to 2000, 4).map(x => (x % 16, new Array[Byte](10000))) val groups = pairs.groupByKey(2).map(x => (x._1, x._2.size)).collect() assert(groups.length === 16) assert(groups.map(_._2).sum === 2000) - // Note that spark.reducer.maxMbInFlight will be cleared in the test suite's after{} block } test("accumulators") { @@ -210,7 +202,6 @@ class DistributedSuite extends FunSuite with Matchers with BeforeAndAfter } test("compute without caching when no partitions fit in memory") { - System.setProperty("spark.storage.memoryFraction", "0.0001") sc = new SparkContext(clusterUrl, "test") // data will be 4 million * 4 bytes = 16 MB in size, but our memoryFraction set the cache // to only 50 KB (0.0001 of 512 MB), so no partitions should fit in memory @@ -218,12 +209,11 @@ class DistributedSuite extends FunSuite with Matchers with BeforeAndAfter assert(data.count() === 4000000) assert(data.count() === 4000000) assert(data.count() === 4000000) - System.clearProperty("spark.storage.memoryFraction") } test("compute when only some partitions fit in memory") { - System.setProperty("spark.storage.memoryFraction", "0.01") - sc = new SparkContext(clusterUrl, "test") + val conf = new SparkConf().set("spark.storage.memoryFraction", "0.01") + sc = new SparkContext(clusterUrl, "test", conf) // data will be 4 million * 4 bytes = 16 MB in size, but our memoryFraction set the cache // to only 5 MB (0.01 of 512 MB), so not all of it will fit in memory; we use 20 partitions // to make sure that *some* of them do fit though @@ -231,7 +221,6 @@ class DistributedSuite extends FunSuite with Matchers with BeforeAndAfter assert(data.count() === 4000000) assert(data.count() === 4000000) assert(data.count() === 4000000) - System.clearProperty("spark.storage.memoryFraction") } test("passing environment variables to cluster") { diff --git a/core/src/test/scala/org/apache/spark/FileServerSuite.scala b/core/src/test/scala/org/apache/spark/FileServerSuite.scala index 49426545c767e..0f49ce4754fbb 100644 --- a/core/src/test/scala/org/apache/spark/FileServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileServerSuite.scala @@ -31,10 +31,11 @@ class FileServerSuite extends FunSuite with LocalSparkContext { @transient var tmpFile: File = _ @transient var tmpJarUrl: String = _ + def newConf: SparkConf = new SparkConf(loadDefaults = false).set("spark.authenticate", "false") + override def beforeEach() { super.beforeEach() resetSparkContext() - System.setProperty("spark.authenticate", "false") } override def beforeAll() { @@ -52,7 +53,6 @@ class FileServerSuite extends FunSuite with LocalSparkContext { val jarFile = new File(testTempDir, "test.jar") val jarStream = new FileOutputStream(jarFile) val jar = new JarOutputStream(jarStream, new java.util.jar.Manifest()) - System.setProperty("spark.authenticate", "false") val jarEntry = new JarEntry(textFile.getName) jar.putNextEntry(jarEntry) @@ 
-74,7 +74,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { } test("Distributing files locally") { - sc = new SparkContext("local[4]", "test") + sc = new SparkContext("local[4]", "test", newConf) sc.addFile(tmpFile.toString) val testData = Array((1,1), (1,1), (2,1), (3,5), (2,2), (3,0)) val result = sc.parallelize(testData).reduceByKey { @@ -108,7 +108,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { test("Distributing files locally using URL as input") { // addFile("file:///....") - sc = new SparkContext("local[4]", "test") + sc = new SparkContext("local[4]", "test", newConf) sc.addFile(new File(tmpFile.toString).toURI.toString) val testData = Array((1,1), (1,1), (2,1), (3,5), (2,2), (3,0)) val result = sc.parallelize(testData).reduceByKey { @@ -122,7 +122,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { } test ("Dynamically adding JARS locally") { - sc = new SparkContext("local[4]", "test") + sc = new SparkContext("local[4]", "test", newConf) sc.addJar(tmpJarUrl) val testData = Array((1, 1)) sc.parallelize(testData).foreach { x => @@ -133,7 +133,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { } test("Distributing files on a standalone cluster") { - sc = new SparkContext("local-cluster[1,1,512]", "test") + sc = new SparkContext("local-cluster[1,1,512]", "test", newConf) sc.addFile(tmpFile.toString) val testData = Array((1,1), (1,1), (2,1), (3,5), (2,2), (3,0)) val result = sc.parallelize(testData).reduceByKey { @@ -147,7 +147,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { } test ("Dynamically adding JARS on a standalone cluster") { - sc = new SparkContext("local-cluster[1,1,512]", "test") + sc = new SparkContext("local-cluster[1,1,512]", "test", newConf) sc.addJar(tmpJarUrl) val testData = Array((1,1)) sc.parallelize(testData).foreach { x => @@ -158,7 +158,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { } test ("Dynamically adding JARS on a standalone cluster using local: URL") { - sc = new SparkContext("local-cluster[1,1,512]", "test") + sc = new SparkContext("local-cluster[1,1,512]", "test", newConf) sc.addJar(tmpJarUrl.replace("file", "local")) val testData = Array((1,1)) sc.parallelize(testData).foreach { x => diff --git a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala index 41ed2bce55ce1..7584ae79fc920 100644 --- a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala +++ b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala @@ -40,12 +40,11 @@ class JobCancellationSuite extends FunSuite with Matchers with BeforeAndAfter override def afterEach() { super.afterEach() resetSparkContext() - System.clearProperty("spark.scheduler.mode") } test("local mode, FIFO scheduler") { - System.setProperty("spark.scheduler.mode", "FIFO") - sc = new SparkContext("local[2]", "test") + val conf = new SparkConf().set("spark.scheduler.mode", "FIFO") + sc = new SparkContext("local[2]", "test", conf) testCount() testTake() // Make sure we can still launch tasks. 
@@ -53,10 +52,10 @@ class JobCancellationSuite extends FunSuite with Matchers with BeforeAndAfter } test("local mode, fair scheduler") { - System.setProperty("spark.scheduler.mode", "FAIR") + val conf = new SparkConf().set("spark.scheduler.mode", "FAIR") val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile() - System.setProperty("spark.scheduler.allocation.file", xmlPath) - sc = new SparkContext("local[2]", "test") + conf.set("spark.scheduler.allocation.file", xmlPath) + sc = new SparkContext("local[2]", "test", conf) testCount() testTake() // Make sure we can still launch tasks. @@ -64,8 +63,8 @@ class JobCancellationSuite extends FunSuite with Matchers with BeforeAndAfter } test("cluster mode, FIFO scheduler") { - System.setProperty("spark.scheduler.mode", "FIFO") - sc = new SparkContext("local-cluster[2,1,512]", "test") + val conf = new SparkConf().set("spark.scheduler.mode", "FIFO") + sc = new SparkContext("local-cluster[2,1,512]", "test", conf) testCount() testTake() // Make sure we can still launch tasks. @@ -73,10 +72,10 @@ class JobCancellationSuite extends FunSuite with Matchers with BeforeAndAfter } test("cluster mode, fair scheduler") { - System.setProperty("spark.scheduler.mode", "FAIR") + val conf = new SparkConf().set("spark.scheduler.mode", "FAIR") val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile() - System.setProperty("spark.scheduler.allocation.file", xmlPath) - sc = new SparkContext("local-cluster[2,1,512]", "test") + conf.set("spark.scheduler.allocation.file", xmlPath) + sc = new SparkContext("local-cluster[2,1,512]", "test", conf) testCount() testTake() // Make sure we can still launch tasks. diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index 58a96245a9b53..f57921b768310 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -35,19 +35,15 @@ abstract class ShuffleSuite extends FunSuite with Matchers with LocalSparkContex conf.set("spark.test.noStageRetry", "true") test("groupByKey without compression") { - try { - System.setProperty("spark.shuffle.compress", "false") - sc = new SparkContext("local", "test", conf) - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (2, 1)), 4) - val groups = pairs.groupByKey(4).collect() - assert(groups.size === 2) - val valuesFor1 = groups.find(_._1 == 1).get._2 - assert(valuesFor1.toList.sorted === List(1, 2, 3)) - val valuesFor2 = groups.find(_._1 == 2).get._2 - assert(valuesFor2.toList.sorted === List(1)) - } finally { - System.setProperty("spark.shuffle.compress", "true") - } + val myConf = conf.clone().set("spark.shuffle.compress", "false") + sc = new SparkContext("local", "test", myConf) + val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (2, 1)), 4) + val groups = pairs.groupByKey(4).collect() + assert(groups.size === 2) + val valuesFor1 = groups.find(_._1 == 1).get._2 + assert(valuesFor1.toList.sorted === List(1, 2, 3)) + val valuesFor2 = groups.find(_._1 == 2).get._2 + assert(valuesFor2.toList.sorted === List(1)) } test("shuffle non-zero block size") { diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala index 5d018ea9868a7..790976a5ac308 100644 --- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala @@ -19,27 +19,20 @@ package org.apache.spark import 
org.scalatest.FunSuite import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer} +import org.apache.spark.util.ResetSystemProperties import com.esotericsoftware.kryo.Kryo -class SparkConfSuite extends FunSuite with LocalSparkContext { +class SparkConfSuite extends FunSuite with LocalSparkContext with ResetSystemProperties { test("loading from system properties") { - try { - System.setProperty("spark.test.testProperty", "2") - val conf = new SparkConf() - assert(conf.get("spark.test.testProperty") === "2") - } finally { - System.clearProperty("spark.test.testProperty") - } + System.setProperty("spark.test.testProperty", "2") + val conf = new SparkConf() + assert(conf.get("spark.test.testProperty") === "2") } test("initializing without loading defaults") { - try { - System.setProperty("spark.test.testProperty", "2") - val conf = new SparkConf(false) - assert(!conf.contains("spark.test.testProperty")) - } finally { - System.clearProperty("spark.test.testProperty") - } + System.setProperty("spark.test.testProperty", "2") + val conf = new SparkConf(false) + assert(!conf.contains("spark.test.testProperty")) } test("named set methods") { @@ -117,23 +110,17 @@ class SparkConfSuite extends FunSuite with LocalSparkContext { test("nested property names") { // This wasn't supported by some external conf parsing libraries - try { - System.setProperty("spark.test.a", "a") - System.setProperty("spark.test.a.b", "a.b") - System.setProperty("spark.test.a.b.c", "a.b.c") - val conf = new SparkConf() - assert(conf.get("spark.test.a") === "a") - assert(conf.get("spark.test.a.b") === "a.b") - assert(conf.get("spark.test.a.b.c") === "a.b.c") - conf.set("spark.test.a.b", "A.B") - assert(conf.get("spark.test.a") === "a") - assert(conf.get("spark.test.a.b") === "A.B") - assert(conf.get("spark.test.a.b.c") === "a.b.c") - } finally { - System.clearProperty("spark.test.a") - System.clearProperty("spark.test.a.b") - System.clearProperty("spark.test.a.b.c") - } + System.setProperty("spark.test.a", "a") + System.setProperty("spark.test.a.b", "a.b") + System.setProperty("spark.test.a.b.c", "a.b.c") + val conf = new SparkConf() + assert(conf.get("spark.test.a") === "a") + assert(conf.get("spark.test.a.b") === "a.b") + assert(conf.get("spark.test.a.b.c") === "a.b.c") + conf.set("spark.test.a.b", "A.B") + assert(conf.get("spark.test.a") === "a") + assert(conf.get("spark.test.a.b") === "A.B") + assert(conf.get("spark.test.a.b.c") === "a.b.c") } test("register kryo classes through registerKryoClasses") { diff --git a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala index 0390a2e4f1dbb..8ae4f243ec1ae 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala @@ -27,10 +27,13 @@ import org.apache.spark.scheduler.local.LocalBackend class SparkContextSchedulerCreationSuite extends FunSuite with LocalSparkContext with PrivateMethodTester with Logging { - def createTaskScheduler(master: String): TaskSchedulerImpl = { + def createTaskScheduler(master: String): TaskSchedulerImpl = + createTaskScheduler(master, new SparkConf()) + + def createTaskScheduler(master: String, conf: SparkConf): TaskSchedulerImpl = { // Create local SparkContext to setup a SparkEnv. We don't actually want to start() the // real schedulers, so we don't want to create a full SparkContext with the desired scheduler. 
- sc = new SparkContext("local", "test") + sc = new SparkContext("local", "test", conf) val createTaskSchedulerMethod = PrivateMethod[Tuple2[SchedulerBackend, TaskScheduler]]('createTaskScheduler) val (_, sched) = SparkContext invokePrivate createTaskSchedulerMethod(sc, master) @@ -102,19 +105,13 @@ class SparkContextSchedulerCreationSuite } test("local-default-parallelism") { - val defaultParallelism = System.getProperty("spark.default.parallelism") - System.setProperty("spark.default.parallelism", "16") - val sched = createTaskScheduler("local") + val conf = new SparkConf().set("spark.default.parallelism", "16") + val sched = createTaskScheduler("local", conf) sched.backend match { case s: LocalBackend => assert(s.defaultParallelism() === 16) case _ => fail() } - - Option(defaultParallelism) match { - case Some(v) => System.setProperty("spark.default.parallelism", v) - case _ => System.clearProperty("spark.default.parallelism") - } } test("simr") { @@ -155,9 +152,10 @@ class SparkContextSchedulerCreationSuite testYarn("yarn-client", "org.apache.spark.scheduler.cluster.YarnClientClusterScheduler") } - def testMesos(master: String, expectedClass: Class[_]) { + def testMesos(master: String, expectedClass: Class[_], coarse: Boolean) { + val conf = new SparkConf().set("spark.mesos.coarse", coarse.toString) try { - val sched = createTaskScheduler(master) + val sched = createTaskScheduler(master, conf) assert(sched.backend.getClass === expectedClass) } catch { case e: UnsatisfiedLinkError => @@ -168,17 +166,14 @@ class SparkContextSchedulerCreationSuite } test("mesos fine-grained") { - System.setProperty("spark.mesos.coarse", "false") - testMesos("mesos://localhost:1234", classOf[MesosSchedulerBackend]) + testMesos("mesos://localhost:1234", classOf[MesosSchedulerBackend], coarse = false) } test("mesos coarse-grained") { - System.setProperty("spark.mesos.coarse", "true") - testMesos("mesos://localhost:1234", classOf[CoarseMesosSchedulerBackend]) + testMesos("mesos://localhost:1234", classOf[CoarseMesosSchedulerBackend], coarse = true) } test("mesos with zookeeper") { - System.setProperty("spark.mesos.coarse", "false") - testMesos("zk://localhost:1234,localhost:2345", classOf[MesosSchedulerBackend]) + testMesos("zk://localhost:1234,localhost:2345", classOf[MesosSchedulerBackend], coarse = false) } } diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index 1362022104195..8b3c6871a7b39 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -23,55 +23,37 @@ import org.apache.hadoop.io.BytesWritable class SparkContextSuite extends FunSuite with LocalSparkContext { - /** Allows system properties to be changed in tests */ - private def withSystemProperty[T](property: String, value: String)(block: => T): T = { - val originalValue = System.getProperty(property) - try { - System.setProperty(property, value) - block - } finally { - if (originalValue == null) { - System.clearProperty(property) - } else { - System.setProperty(property, originalValue) - } - } - } - test("Only one SparkContext may be active at a time") { // Regression test for SPARK-4180 - withSystemProperty("spark.driver.allowMultipleContexts", "false") { - val conf = new SparkConf().setAppName("test").setMaster("local") - sc = new SparkContext(conf) - // A SparkContext is already running, so we shouldn't be able to create a second one - intercept[SparkException] { new 
SparkContext(conf) } - // After stopping the running context, we should be able to create a new one - resetSparkContext() - sc = new SparkContext(conf) - } + val conf = new SparkConf().setAppName("test").setMaster("local") + .set("spark.driver.allowMultipleContexts", "false") + sc = new SparkContext(conf) + // A SparkContext is already running, so we shouldn't be able to create a second one + intercept[SparkException] { new SparkContext(conf) } + // After stopping the running context, we should be able to create a new one + resetSparkContext() + sc = new SparkContext(conf) } test("Can still construct a new SparkContext after failing to construct a previous one") { - withSystemProperty("spark.driver.allowMultipleContexts", "false") { - // This is an invalid configuration (no app name or master URL) - intercept[SparkException] { - new SparkContext(new SparkConf()) - } - // Even though those earlier calls failed, we should still be able to create a new context - sc = new SparkContext(new SparkConf().setMaster("local").setAppName("test")) + val conf = new SparkConf().set("spark.driver.allowMultipleContexts", "false") + // This is an invalid configuration (no app name or master URL) + intercept[SparkException] { + new SparkContext(conf) } + // Even though those earlier calls failed, we should still be able to create a new context + sc = new SparkContext(conf.setMaster("local").setAppName("test")) } test("Check for multiple SparkContexts can be disabled via undocumented debug option") { - withSystemProperty("spark.driver.allowMultipleContexts", "true") { - var secondSparkContext: SparkContext = null - try { - val conf = new SparkConf().setAppName("test").setMaster("local") - sc = new SparkContext(conf) - secondSparkContext = new SparkContext(conf) - } finally { - Option(secondSparkContext).foreach(_.stop()) - } + var secondSparkContext: SparkContext = null + try { + val conf = new SparkConf().setAppName("test").setMaster("local") + .set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + secondSparkContext = new SparkContext(conf) + } finally { + Option(secondSparkContext).foreach(_.stop()) } } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index eb7bd7ab3986e..5eda2d41f0e6d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -23,11 +23,13 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark._ import org.apache.spark.deploy.SparkSubmit._ -import org.apache.spark.util.Utils +import org.apache.spark.util.{ResetSystemProperties, Utils} import org.scalatest.FunSuite import org.scalatest.Matchers -class SparkSubmitSuite extends FunSuite with Matchers { +// Note: this suite mixes in ResetSystemProperties because SparkSubmit.main() sets a bunch +// of properties that need to be cleared after tests.
+class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties { def beforeAll() { System.setProperty("spark.testing", "true") } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index b276343cb412c..24f41bf8cccda 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -26,9 +26,10 @@ import org.scalatest.Matchers import org.apache.spark.{LocalSparkContext, SparkContext} import org.apache.spark.executor.TaskMetrics +import org.apache.spark.util.ResetSystemProperties -class SparkListenerSuite extends FunSuite with LocalSparkContext with Matchers - with BeforeAndAfter with BeforeAndAfterAll { +class SparkListenerSuite extends FunSuite with LocalSparkContext with Matchers with BeforeAndAfter + with BeforeAndAfterAll with ResetSystemProperties { /** Length of time to wait while draining listener events. */ val WAIT_TIMEOUT_MILLIS = 10000 @@ -37,10 +38,6 @@ class SparkListenerSuite extends FunSuite with LocalSparkContext with Matchers sc = new SparkContext("local", "SparkListenerSuite") } - override def afterAll() { - System.clearProperty("spark.akka.frameSize") - } - test("basic creation and shutdown of LiveListenerBus") { val counter = new BasicJobCounter val bus = new LiveListenerBus diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala index 5768a3a733f00..3aab5a156ee77 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala @@ -21,7 +21,7 @@ import java.nio.ByteBuffer import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite} -import org.apache.spark.{LocalSparkContext, SparkContext, SparkEnv} +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv} import org.apache.spark.storage.TaskResultBlockId /** @@ -55,27 +55,20 @@ class ResultDeletingTaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedule /** * Tests related to handling task results (both direct and indirect). */ -class TaskResultGetterSuite extends FunSuite with BeforeAndAfter with BeforeAndAfterAll - with LocalSparkContext { +class TaskResultGetterSuite extends FunSuite with BeforeAndAfter with LocalSparkContext { - override def beforeAll { - // Set the Akka frame size to be as small as possible (it must be an integer, so 1 is as small - // as we can make it) so the tests don't take too long. - System.setProperty("spark.akka.frameSize", "1") - } - - override def afterAll { - System.clearProperty("spark.akka.frameSize") - } + // Set the Akka frame size to be as small as possible (it must be an integer, so 1 is as small + // as we can make it) so the tests don't take too long. 
+ def conf: SparkConf = new SparkConf().set("spark.akka.frameSize", "1") test("handling results smaller than Akka frame size") { - sc = new SparkContext("local", "test") + sc = new SparkContext("local", "test", conf) val result = sc.parallelize(Seq(1), 1).map(x => 2 * x).reduce((x, y) => x) assert(result === 2) } test("handling results larger than Akka frame size") { - sc = new SparkContext("local", "test") + sc = new SparkContext("local", "test", conf) val akkaFrameSize = sc.env.actorSystem.settings.config.getBytes("akka.remote.netty.tcp.maximum-frame-size").toInt val result = sc.parallelize(Seq(1), 1).map(x => 1.to(akkaFrameSize).toArray).reduce((x, y) => x) @@ -89,7 +82,7 @@ class TaskResultGetterSuite extends FunSuite with BeforeAndAfter with BeforeAndA test("task retried if result missing from block manager") { // Set the maximum number of task failures to > 0, so that the task set isn't aborted // after the result is missing. - sc = new SparkContext("local[1,2]", "test") + sc = new SparkContext("local[1,2]", "test", conf) // If this test hangs, it's probably because no resource offers were made after the task // failed. val scheduler: TaskSchedulerImpl = sc.taskScheduler match { diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 7532da88c6065..40aaf9dd1f1e9 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -162,12 +162,12 @@ class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Loggin } test("Fair Scheduler Test") { - sc = new SparkContext("local", "TaskSchedulerImplSuite") + val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile() + val conf = new SparkConf().set("spark.scheduler.allocation.file", xmlPath) + sc = new SparkContext("local", "TaskSchedulerImplSuite", conf) val taskScheduler = new TaskSchedulerImpl(sc) val taskSet = FakeTask.createTaskSet(1) - val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile() - System.setProperty("spark.scheduler.allocation.file", xmlPath) val rootPool = new Pool("", SchedulingMode.FAIR, 0, 0) val schedulableBuilder = new FairSchedulableBuilder(rootPool, sc.conf) schedulableBuilder.buildPools() diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index 5554efbcbadf8..ffe6f039145ea 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -33,7 +33,7 @@ import akka.util.Timeout import org.mockito.Mockito.{mock, when} -import org.scalatest.{BeforeAndAfter, FunSuite, Matchers, PrivateMethodTester} +import org.scalatest._ import org.scalatest.concurrent.Eventually._ import org.scalatest.concurrent.Timeouts._ @@ -44,18 +44,17 @@ import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.shuffle.hash.HashShuffleManager import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat -import org.apache.spark.util.{AkkaUtils, ByteBufferInputStream, SizeEstimator, Utils} +import org.apache.spark.util._ -class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfter - with PrivateMethodTester { +class BlockManagerSuite extends FunSuite with 
Matchers with BeforeAndAfterEach + with PrivateMethodTester with ResetSystemProperties { private val conf = new SparkConf(false) var store: BlockManager = null var store2: BlockManager = null var actorSystem: ActorSystem = null var master: BlockManagerMaster = null - var oldArch: String = null conf.set("spark.authenticate", "false") val securityMgr = new SecurityManager(conf) val mapOutputTracker = new MapOutputTrackerMaster(conf) @@ -79,13 +78,13 @@ class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfter manager } - before { + override def beforeEach(): Unit = { val (actorSystem, boundPort) = AkkaUtils.createActorSystem( "test", "localhost", 0, conf = conf, securityManager = securityMgr) this.actorSystem = actorSystem // Set the arch to 64-bit and compressedOops to true to get a deterministic test-case - oldArch = System.setProperty("os.arch", "amd64") + System.setProperty("os.arch", "amd64") conf.set("os.arch", "amd64") conf.set("spark.test.useCompressedOops", "true") conf.set("spark.driver.port", boundPort.toString) @@ -100,7 +99,7 @@ class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfter SizeEstimator invokePrivate initialize() } - after { + override def afterEach(): Unit = { if (store != null) { store.stop() store = null @@ -113,14 +112,6 @@ class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfter actorSystem.awaitTermination() actorSystem = null master = null - - if (oldArch != null) { - conf.set("os.arch", oldArch) - } else { - System.clearProperty("os.arch") - } - - System.clearProperty("spark.test.useCompressedOops") } test("StorageLevel object caching") { diff --git a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala index 7bca1711ae226..6bbf72e929dcb 100644 --- a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala @@ -31,7 +31,7 @@ import org.apache.spark.storage.BlockManagerId /** * Test the AkkaUtils with various security settings. */ -class AkkaUtilsSuite extends FunSuite with LocalSparkContext { +class AkkaUtilsSuite extends FunSuite with LocalSparkContext with ResetSystemProperties { test("remote fetch security bad password") { val conf = new SparkConf diff --git a/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala b/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala new file mode 100644 index 0000000000000..d4b92f33dd9e6 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.util + +import java.util.Properties + +import org.scalatest.{BeforeAndAfterEach, Suite} + +/** + * Mixin for automatically resetting system properties that are modified in ScalaTest tests. + * This resets the properties after each individual test. + * + * The order in which fixtures are mixed in affects the order in which they are invoked by tests. + * If we have a suite `MySuite extends FunSuite with Foo with Bar`, then + * Bar's `super` is Foo, so Bar's beforeEach() and afterEach() methods will be invoked first + * by the test runner. + * + * This means that ResetSystemProperties should appear as the last trait in test suites that it's + * mixed into in order to ensure that the system properties snapshot occurs as early as possible. + * ResetSystemProperties calls super.afterEach() before performing its own cleanup, ensuring that + * the old properties are restored as late as possible. + * + * See the "Composing fixtures by stacking traits" section at + * http://www.scalatest.org/user_guide/sharing_fixtures for more details about this pattern. + */ +private[spark] trait ResetSystemProperties extends BeforeAndAfterEach { this: Suite => + var oldProperties: Properties = null + + override def beforeEach(): Unit = { + oldProperties = new Properties(System.getProperties) + super.beforeEach() + } + + override def afterEach(): Unit = { + try { + super.afterEach() + } finally { + System.setProperties(oldProperties) + oldProperties = null + } + } +} diff --git a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala index 0ea2d13a83505..7424c2e91d4f2 100644 --- a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala @@ -17,9 +17,7 @@ package org.apache.spark.util -import org.scalatest.BeforeAndAfterAll -import org.scalatest.FunSuite -import org.scalatest.PrivateMethodTester +import org.scalatest.{BeforeAndAfterEach, BeforeAndAfterAll, FunSuite, PrivateMethodTester} class DummyClass1 {} @@ -46,20 +44,12 @@ class DummyString(val arr: Array[Char]) { } class SizeEstimatorSuite - extends FunSuite with BeforeAndAfterAll with PrivateMethodTester { + extends FunSuite with BeforeAndAfterEach with PrivateMethodTester with ResetSystemProperties { - var oldArch: String = _ - var oldOops: String = _ - - override def beforeAll() { + override def beforeEach() { // Set the arch to 64-bit and compressedOops to true to get a deterministic test-case - oldArch = System.setProperty("os.arch", "amd64") - oldOops = System.setProperty("spark.test.useCompressedOops", "true") - } - - override def afterAll() { - resetOrClear("os.arch", oldArch) - resetOrClear("spark.test.useCompressedOops", oldOops) + System.setProperty("os.arch", "amd64") + System.setProperty("spark.test.useCompressedOops", "true") } test("simple classes") { @@ -122,7 +112,7 @@ class SizeEstimatorSuite } test("32-bit arch") { - val arch = System.setProperty("os.arch", "x86") + System.setProperty("os.arch", "x86") val initialize = PrivateMethod[Unit]('initialize) SizeEstimator invokePrivate initialize() @@ -131,14 +121,13 @@ class SizeEstimatorSuite assertResult(48)(SizeEstimator.estimate(DummyString("a"))) assertResult(48)(SizeEstimator.estimate(DummyString("ab"))) assertResult(56)(SizeEstimator.estimate(DummyString("abcdefgh"))) - resetOrClear("os.arch", arch) } // NOTE: The String class definition varies across JDK versions (1.6 vs.
1.7) and vendors // (Sun vs IBM). Use a DummyString class to make tests deterministic. test("64-bit arch with no compressed oops") { - val arch = System.setProperty("os.arch", "amd64") - val oops = System.setProperty("spark.test.useCompressedOops", "false") + System.setProperty("os.arch", "amd64") + System.setProperty("spark.test.useCompressedOops", "false") val initialize = PrivateMethod[Unit]('initialize) SizeEstimator invokePrivate initialize() @@ -146,16 +135,5 @@ class SizeEstimatorSuite assertResult(64)(SizeEstimator.estimate(DummyString("a"))) assertResult(64)(SizeEstimator.estimate(DummyString("ab"))) assertResult(72)(SizeEstimator.estimate(DummyString("abcdefgh"))) - - resetOrClear("os.arch", arch) - resetOrClear("spark.test.useCompressedOops", oops) - } - - def resetOrClear(prop: String, oldValue: String) { - if (oldValue != null) { - System.setProperty(prop, oldValue) - } else { - System.clearProperty(prop) - } } } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index f9d4bea823f7c..4544382094f96 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -31,7 +31,7 @@ import org.scalatest.FunSuite import org.apache.spark.SparkConf -class UtilsSuite extends FunSuite { +class UtilsSuite extends FunSuite with ResetSystemProperties { test("bytesToString") { assert(Utils.bytesToString(10) === "10.0 B") diff --git a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala index adecd934358c4..1b53f3edbe92e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala @@ -28,11 +28,9 @@ object BroadcastTest { val bcName = if (args.length > 2) args(2) else "Http" val blockSize = if (args.length > 3) args(3) else "4096" - System.setProperty("spark.broadcast.factory", "org.apache.spark.broadcast." 
+ bcName + - "BroadcastFactory") - System.setProperty("spark.broadcast.blockSize", blockSize) val sparkConf = new SparkConf().setAppName("Broadcast Test") - + .set("spark.broadcast.factory", s"org.apache.spark.broadcast.${bcName}BroadcastFactory") + .set("spark.broadcast.blockSize", blockSize) val sc = new SparkContext(sparkConf) val slices = if (args.length > 0) args(0).toInt else 2 diff --git a/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 6e1f01900071b..1e24da7f5f60c 100644 --- a/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -17,6 +17,7 @@ package org.apache.spark.streaming; +import org.apache.spark.SparkConf; import org.apache.spark.streaming.api.java.JavaStreamingContext; import org.junit.After; import org.junit.Before; @@ -27,8 +28,11 @@ public abstract class LocalJavaStreamingContext { @Before public void setUp() { - System.setProperty("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); - ssc = new JavaStreamingContext("local[2]", "test", new Duration(1000)); + SparkConf conf = new SparkConf() + .setMaster("local[2]") + .setAppName("test") + .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 6e1f01900071b..1e24da7f5f60c 100644 --- a/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -17,6 +17,7 @@ package org.apache.spark.streaming; +import org.apache.spark.SparkConf; import org.apache.spark.streaming.api.java.JavaStreamingContext; import org.junit.After; import org.junit.Before; @@ -27,8 +28,11 @@ public abstract class LocalJavaStreamingContext { @Before public void setUp() { - System.setProperty("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); - ssc = new JavaStreamingContext("local[2]", "test", new
Duration(1000)); + SparkConf conf = new SparkConf() + .setMaster("local[2]") + .setAppName("test") + .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 6e1f01900071b..1e24da7f5f60c 100644 --- a/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -17,6 +17,7 @@ package org.apache.spark.streaming; +import org.apache.spark.SparkConf; import org.apache.spark.streaming.api.java.JavaStreamingContext; import org.junit.After; import org.junit.Before; @@ -27,8 +28,11 @@ public abstract class LocalJavaStreamingContext { @Before public void setUp() { - System.setProperty("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); - ssc = new JavaStreamingContext("local[2]", "test", new Duration(1000)); + SparkConf conf = new SparkConf() + .setMaster("local[2]") + .setAppName("test") + .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 6e1f01900071b..1e24da7f5f60c 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -17,6 +17,7 @@ package org.apache.spark.streaming; +import org.apache.spark.SparkConf; import org.apache.spark.streaming.api.java.JavaStreamingContext; import org.junit.After; import org.junit.Before; @@ -27,8 +28,11 @@ public abstract class LocalJavaStreamingContext { @Before public void setUp() { - System.setProperty("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); - ssc = new JavaStreamingContext("local[2]", "test", new Duration(1000)); + SparkConf conf = new SparkConf() + .setMaster("local[2]") + .setAppName("test") + .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala b/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala index db58eb642b56d..15ee95070a3d3 100644 --- a/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala +++ b/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala @@ -21,7 +21,7 @@ import java.util.concurrent.{CountDownLatch, Executors} import java.util.concurrent.atomic.AtomicLong import org.apache.spark.executor.ShuffleWriteMetrics -import org.apache.spark.SparkContext +import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.shuffle.hash.HashShuffleManager import org.apache.spark.util.Utils @@ -49,13 +49,13 @@ object StoragePerfTester { val writeData = "1" * recordLength val executor = Executors.newFixedThreadPool(numMaps) - System.setProperty("spark.shuffle.compress", "false") - System.setProperty("spark.shuffle.sync", "true") - 
System.setProperty("spark.shuffle.manager", - "org.apache.spark.shuffle.hash.HashShuffleManager") + val conf = new SparkConf() + .set("spark.shuffle.compress", "false") + .set("spark.shuffle.sync", "true") + .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager") // This is only used to instantiate a BlockManager. All thread scheduling is done manually. - val sc = new SparkContext("local[4]", "Write Tester") + val sc = new SparkContext("local[4]", "Write Tester", conf) val hashShuffleManager = sc.env.shuffleManager.asInstanceOf[HashShuffleManager] def writeOutputBytes(mapId: Int, total: AtomicLong) = { From 06a9aa589c518a40a3c7cc201e89d75af77ab93e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 31 Dec 2014 11:50:53 -0800 Subject: [PATCH 034/116] [SPARK-4797] Replace breezeSquaredDistance This PR replaces slow breezeSquaredDistance. Author: Liang-Chi Hsieh Closes #3643 from viirya/faster_squareddistance and squashes the following commits: f28b275 [Liang-Chi Hsieh] Move the implementation to linalg.Vectors and rename as sqdist. 0bc48ee [Liang-Chi Hsieh] Merge branch 'master' into faster_squareddistance ba34422 [Liang-Chi Hsieh] Fix bug. 91849d0 [Liang-Chi Hsieh] Modified for comment. 44a65ad [Liang-Chi Hsieh] Modified for comments. 35db395 [Liang-Chi Hsieh] Fix bug and some modifications for comments. f4f5ebb [Liang-Chi Hsieh] Follow BLAS.dot pattern to replace intersect, diff with while-loop. a36e09f [Liang-Chi Hsieh] Use while-loop to replace foreach for better performance. d3e0628 [Liang-Chi Hsieh] Make the methods private. dd415bc [Liang-Chi Hsieh] Consider different cases of SparseVector and DenseVector. 13669db [Liang-Chi Hsieh] Replace breezeSquaredDistance. --- .../apache/spark/mllib/linalg/Vectors.scala | 80 +++++++++++++++++++ .../org/apache/spark/mllib/util/MLUtils.scala | 13 ++- .../spark/mllib/util/MLUtilsSuite.scala | 15 ++++ 3 files changed, 100 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 01f3f90577142..6a782b079aac3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -312,6 +312,86 @@ object Vectors { math.pow(sum, 1.0 / p) } } + + /** + * Returns the squared distance between two Vectors. + * @param v1 first Vector. + * @param v2 second Vector. + * @return squared distance between two Vectors. 
+ */ + def sqdist(v1: Vector, v2: Vector): Double = { + var squaredDistance = 0.0 + (v1, v2) match { + case (v1: SparseVector, v2: SparseVector) => + val v1Values = v1.values + val v1Indices = v1.indices + val v2Values = v2.values + val v2Indices = v2.indices + val nnzv1 = v1Indices.size + val nnzv2 = v2Indices.size + + var kv1 = 0 + var kv2 = 0 + while (kv1 < nnzv1 || kv2 < nnzv2) { + var score = 0.0 + + if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) { + score = v1Values(kv1) + kv1 += 1 + } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { + score = v2Values(kv2) + kv2 += 1 + } else { + score = v1Values(kv1) - v2Values(kv2) + kv1 += 1 + kv2 += 1 + } + squaredDistance += score * score + } + + case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => + squaredDistance = sqdist(v1, v2) + + case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 => + squaredDistance = sqdist(v2, v1) + + // When a SparseVector is approximately dense, we treat it as a DenseVector + case (v1, v2) => + squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){ (distance, elems) => + val score = elems._1 - elems._2 + distance + score * score + } + } + squaredDistance + } + + /** + * Returns the squared distance between DenseVector and SparseVector. + */ + private[mllib] def sqdist(v1: SparseVector, v2: DenseVector): Double = { + var kv1 = 0 + var kv2 = 0 + val indices = v1.indices + var squaredDistance = 0.0 + var iv1 = indices(kv1) + val nnzv2 = v2.size + + while (kv2 < nnzv2) { + var score = 0.0 + if (kv2 != iv1) { + score = v2(kv2) + } else { + score = v1.values(kv1) - v2(kv2) + if (kv1 < indices.length - 1) { + kv1 += 1 + iv1 = indices(kv1) + } + } + squaredDistance += score * score + kv2 += 1 + } + squaredDistance + } } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index da0da0a168c1d..c7843464a7505 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -19,8 +19,7 @@ package org.apache.spark.mllib.util import scala.reflect.ClassTag -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, - squaredDistance => breezeSquaredDistance} +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} import org.apache.spark.annotation.Experimental import org.apache.spark.SparkContext @@ -28,7 +27,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.rdd.PartitionwiseSampledRDD import org.apache.spark.util.random.BernoulliCellSampler import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors} +import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS.dot import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext @@ -266,7 +265,7 @@ object MLUtils { } Vectors.fromBreeze(vector1) } - + /** * Returns the squared Euclidean distance between two vectors. The following formula will be used * if it does not introduce too much numerical error: @@ -316,12 +315,10 @@ object MLUtils { val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / (sqDist + EPSILON) if (precisionBound2 > precision) { - // TODO: breezeSquaredDistance is slow, - // so we should replace it with our own implementation. 
- sqDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze) + sqDist = Vectors.sqdist(v1, v2) } } else { - sqDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze) + sqDist = Vectors.sqdist(v1, v2) } sqDist } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index df07987093fbf..7778847f8b72a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -52,12 +52,27 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { val values = indices.map(i => a(i)) val v2 = Vectors.sparse(n, indices, values) val norm2 = Vectors.norm(v2, 2.0) + val v3 = Vectors.sparse(n, indices, indices.map(i => a(i) + 0.5)) + val norm3 = Vectors.norm(v3, 2.0) val squaredDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze) val fastSquaredDist1 = fastSquaredDistance(v1, norm1, v2, norm2, precision) assert((fastSquaredDist1 - squaredDist) <= precision * squaredDist, s"failed with m = $m") val fastSquaredDist2 = fastSquaredDistance(v1, norm1, Vectors.dense(v2.toArray), norm2, precision) assert((fastSquaredDist2 - squaredDist) <= precision * squaredDist, s"failed with m = $m") + val squaredDist2 = breezeSquaredDistance(v2.toBreeze, v3.toBreeze) + val fastSquaredDist3 = + fastSquaredDistance(v2, norm2, v3, norm3, precision) + assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m") + if (m > 10) { + val v4 = Vectors.sparse(n, indices.slice(0, m - 10), + indices.map(i => a(i) + 0.5).slice(0, m - 10)) + val norm4 = Vectors.norm(v4, 2.0) + val squaredDist = breezeSquaredDistance(v2.toBreeze, v4.toBreeze) + val fastSquaredDist = + fastSquaredDistance(v2, norm2, v4, norm4, precision) + assert((fastSquaredDist - squaredDist) <= precision * squaredDist, s"failed with m = $m") + } } } From 8e14c5eb551ab06c94859c7f6d8c6b62b4d00d59 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 31 Dec 2014 11:54:10 -0800 Subject: [PATCH 035/116] [SPARK-4298][Core] - The spark-submit cannot read Main-Class from Manifest. Resolves a bug where the `Main-Class` from a .jar file wasn't being read in properly. This was caused by the fact that the `primaryResource` object was a URI and needed to be normalized through a call to `.getPath` before it could be passed into the `JarFile` object. 
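A minimal sketch of the idea, separate from the patched SparkSubmitArguments code and with hypothetical names (ManifestMainClass, readMainClass): JarFile wants a plain filesystem path, so a `file:` URI has to be unwrapped with `getPath` before the manifest can be read.

    import java.net.URI
    import java.util.jar.JarFile

    object ManifestMainClass {
      // Returns the Main-Class of a local jar, or None if the jar is remote or has no such entry.
      def readMainClass(primaryResource: String): Option[String] = {
        val uri = new URI(primaryResource)
        uri.getScheme match {
          case "file" | null =>
            // getPath strips the "file:" scheme, giving JarFile the path it expects
            val jar = new JarFile(uri.getPath)
            try {
              Option(jar.getManifest).flatMap(m => Option(m.getMainAttributes.getValue("Main-Class")))
            } finally {
              jar.close()
            }
          case _ =>
            // Remote schemes (hdfs, http, ...) cannot be inspected locally
            None
        }
      }
    }

For any other scheme the patch instead asks the user to pass --class explicitly.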
Author: Brennon York Closes #3561 from brennonyork/SPARK-4298 and squashes the following commits: 5e0fce1 [Brennon York] Use string interpolation for error messages, moved comment line from original code to above its necessary code segment 14daa20 [Brennon York] pushed mainClass assignment into match statement, removed spurious spaces, removed { } from case statements, removed return values c6dad68 [Brennon York] Set case statement to support multiple jar URI's and enabled the 'file' URI to load the main-class 8d20936 [Brennon York] updated to reset the error message back to the default a043039 [Brennon York] updated to split the uri and jar vals 8da7cbf [Brennon York] fixes SPARK-4298 --- .../spark/deploy/SparkSubmitArguments.scala | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index f174bc1af59b4..1faabe91f49a8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -17,6 +17,7 @@ package org.apache.spark.deploy +import java.net.URI import java.util.jar.JarFile import scala.collection.mutable.{ArrayBuffer, HashMap} @@ -125,14 +126,23 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St // Try to set main class from JAR if no --class argument is given if (mainClass == null && !isPython && primaryResource != null) { - try { - val jar = new JarFile(primaryResource) - // Note that this might still return null if no main-class is set; we catch that later - mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class") - } catch { - case e: Exception => - SparkSubmit.printErrorAndExit("Cannot load main class from JAR: " + primaryResource) - return + val uri = new URI(primaryResource) + val uriScheme = uri.getScheme() + + uriScheme match { + case "file" => + try { + val jar = new JarFile(uri.getPath) + // Note that this might still return null if no main-class is set; we catch that later + mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class") + } catch { + case e: Exception => + SparkSubmit.printErrorAndExit(s"Cannot load main class from JAR $primaryResource") + } + case _ => + SparkSubmit.printErrorAndExit( + s"Cannot load main class from JAR $primaryResource with URI $uriScheme. " + + "Please specify a class through --class.") } } From 3d194cc75761fceba77b2c91291b36479b8b556c Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 31 Dec 2014 13:37:04 -0800 Subject: [PATCH 036/116] SPARK-4547 [MLLIB] OOM when making bins in BinaryClassificationMetrics Now that I've implemented the basics here, I'm less convinced there is a need for this change, somehow. Callers can downsample before or after. Really the OOM is not in the ROC curve code, but in code that might `collect()` it for local analysis. Still, might be useful to down-sample since the ROC curve probably never needs millions of points. This is a first pass. Since the `(score,label)` are already grouped and sorted, I think it's sufficient to just take every Nth such pair, in order to downsample by a factor of N? this is just like retaining every Nth point on the curve, which I think is the goal. All of the data is still used to build the curve of course. What do you think about the API, and usefulness? 
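A rough sketch of the binning idea on plain Scala collections (the patch itself works on the RDD of counts with mapPartitions and aggregates BinaryLabelCounters; the helper name and the simple Long counts below are stand-ins):

    // Hypothetical helper: points are (score, count) pairs already sorted by descending score.
    def downsampleCurve(points: Seq[(Double, Long)], numBins: Int): Seq[(Double, Long)] = {
      require(numBins >= 0, "numBins must be nonnegative")
      val grouping = if (numBins == 0) 0 else points.size / numBins
      if (grouping < 2) {
        points // too few points for binning to help; keep the full curve
      } else {
        points.grouped(grouping).map { chunk =>
          // keep the first score in the chunk and fold all of its counts into a single point
          (chunk.head._1, chunk.map(_._2).sum)
        }.toSeq
      }
    }

Because the grouping happens per partition, the last chunk in each partition may be smaller, which is why the resulting number of bins is only approximately numBins.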
Author: Sean Owen Closes #3702 from srowen/SPARK-4547 and squashes the following commits: 1d34d05 [Sean Owen] Indent and reorganize numBins scaladoc 692d825 [Sean Owen] Change handling of large numBins, make 2nd consturctor instead of optional param, style change a03610e [Sean Owen] Add downsamplingFactor to BinaryClassificationMetrics --- .../BinaryClassificationMetrics.scala | 59 ++++++++++++++++++- .../BinaryClassificationMetricsSuite.scala | 36 +++++++++++ 2 files changed, 92 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index 1af40de2c7fcf..ced042e2f96ca 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -28,9 +28,30 @@ import org.apache.spark.rdd.{RDD, UnionRDD} * Evaluator for binary classification. * * @param scoreAndLabels an RDD of (score, label) pairs. + * @param numBins if greater than 0, then the curves (ROC curve, PR curve) computed internally + * will be down-sampled to this many "bins". If 0, no down-sampling will occur. + * This is useful because the curve contains a point for each distinct score + * in the input, and this could be as large as the input itself -- millions of + * points or more, when thousands may be entirely sufficient to summarize + * the curve. After down-sampling, the curves will instead be made of approximately + * `numBins` points instead. Points are made from bins of equal numbers of + * consecutive points. The size of each bin is + * `floor(scoreAndLabels.count() / numBins)`, which means the resulting number + * of bins may not exactly equal numBins. The last bin in each partition may + * be smaller as a result, meaning there may be an extra sample at + * partition boundaries. */ @Experimental -class BinaryClassificationMetrics(scoreAndLabels: RDD[(Double, Double)]) extends Logging { +class BinaryClassificationMetrics( + val scoreAndLabels: RDD[(Double, Double)], + val numBins: Int) extends Logging { + + require(numBins >= 0, "numBins must be nonnegative") + + /** + * Defaults `numBins` to 0. + */ + def this(scoreAndLabels: RDD[(Double, Double)]) = this(scoreAndLabels, 0) /** Unpersist intermediate RDDs used in the computation. 
*/ def unpersist() { @@ -103,7 +124,39 @@ class BinaryClassificationMetrics(scoreAndLabels: RDD[(Double, Double)]) extends mergeValue = (c: BinaryLabelCounter, label: Double) => c += label, mergeCombiners = (c1: BinaryLabelCounter, c2: BinaryLabelCounter) => c1 += c2 ).sortByKey(ascending = false) - val agg = counts.values.mapPartitions { iter => + + val binnedCounts = + // Only down-sample if bins is > 0 + if (numBins == 0) { + // Use original directly + counts + } else { + val countsSize = counts.count() + // Group the iterator into chunks of about countsSize / numBins points, + // so that the resulting number of bins is about numBins + var grouping = countsSize / numBins + if (grouping < 2) { + // numBins was more than half of the size; no real point in down-sampling to bins + logInfo(s"Curve is too small ($countsSize) for $numBins bins to be useful") + counts + } else { + if (grouping >= Int.MaxValue) { + logWarning( + s"Curve too large ($countsSize) for $numBins bins; capping at ${Int.MaxValue}") + grouping = Int.MaxValue + } + counts.mapPartitions(_.grouped(grouping.toInt).map { pairs => + // The score of the combined point will be just the first one's score + val firstScore = pairs.head._1 + // The point will contain all counts in this chunk + val agg = new BinaryLabelCounter() + pairs.foreach(pair => agg += pair._2) + (firstScore, agg) + }) + } + } + + val agg = binnedCounts.values.mapPartitions { iter => val agg = new BinaryLabelCounter() iter.foreach(agg += _) Iterator(agg) @@ -113,7 +166,7 @@ class BinaryClassificationMetrics(scoreAndLabels: RDD[(Double, Double)]) extends (agg: BinaryLabelCounter, c: BinaryLabelCounter) => agg.clone() += c) val totalCount = partitionwiseCumulativeCounts.last logInfo(s"Total counts: $totalCount") - val cumulativeCounts = counts.mapPartitionsWithIndex( + val cumulativeCounts = binnedCounts.mapPartitionsWithIndex( (index: Int, iter: Iterator[(Double, BinaryLabelCounter)]) => { val cumCount = partitionwiseCumulativeCounts(index) iter.map { case (score, c) => diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetricsSuite.scala index 8a18e2971cab6..e0224f960cc43 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetricsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetricsSuite.scala @@ -124,4 +124,40 @@ class BinaryClassificationMetricsSuite extends FunSuite with MLlibTestSparkConte validateMetrics(metrics, thresholds, rocCurve, prCurve, f1, f2, precisions, recalls) } + + test("binary evaluation metrics with downsampling") { + val scoreAndLabels = Seq( + (0.1, 0.0), (0.2, 0.0), (0.3, 1.0), (0.4, 0.0), (0.5, 0.0), + (0.6, 1.0), (0.7, 1.0), (0.8, 0.0), (0.9, 1.0)) + + val scoreAndLabelsRDD = sc.parallelize(scoreAndLabels, 1) + + val original = new BinaryClassificationMetrics(scoreAndLabelsRDD) + val originalROC = original.roc().collect().sorted.toList + // Add 2 for (0,0) and (1,1) appended at either end + assert(2 + scoreAndLabels.size == originalROC.size) + assert( + List( + (0.0, 0.0), (0.0, 0.25), (0.2, 0.25), (0.2, 0.5), (0.2, 0.75), + (0.4, 0.75), (0.6, 0.75), (0.6, 1.0), (0.8, 1.0), (1.0, 1.0), + (1.0, 1.0) + ) == + originalROC) + + val numBins = 4 + + val downsampled = new BinaryClassificationMetrics(scoreAndLabelsRDD, numBins) + val downsampledROC = downsampled.roc().collect().sorted.toList + assert( + // May have to add 1 if the 
sample factor didn't divide evenly + 2 + (numBins + (if (scoreAndLabels.size % numBins == 0) 0 else 1)) == + downsampledROC.size) + assert( + List( + (0.0, 0.0), (0.2, 0.25), (0.2, 0.75), (0.6, 0.75), (0.8, 1.0), + (1.0, 1.0), (1.0, 1.0) + ) == + downsampledROC) + } + } From e24d3a9a29962023cc722896a14c7bfe06e8e601 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 12 Dec 2014 12:38:37 -0800 Subject: [PATCH 037/116] [HOTFIX] Disable Spark UI in SparkSubmitSuite tests This should fix a major cause of build breaks when running many parallel tests. --- .../test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 5eda2d41f0e6d..065b7534cece6 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -290,6 +290,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "--class", SimpleApplicationTest.getClass.getName.stripSuffix("$"), "--name", "testApp", "--master", "local", + "--conf", "spark.ui.enabled=false", unusedJar.toString) runSparkSubmit(args) } @@ -304,6 +305,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "--name", "testApp", "--master", "local-cluster[2,1,512]", "--jars", jarsString, + "--conf", "spark.ui.enabled=false", unusedJar.toString) runSparkSubmit(args) } From c88a3d7fca20d36ee566d48e0cb91fe33a7a6d99 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 31 Dec 2014 14:25:03 -0800 Subject: [PATCH 038/116] [SPARK-5038][SQL] Add explicit return type for implicit functions in Spark SQL As we learned in https://github.com/apache/spark/pull/3580, not explicitly typing implicit functions can lead to compiler bugs and potentially unexpected runtime behavior. Author: Reynold Xin Closes #3859 from rxin/sql-implicits and squashes the following commits: 30c2c24 [Reynold Xin] [SPARK-5038] Add explicit return type for implicit functions in Spark SQL. --- .../spark/sql/catalyst/dsl/package.scala | 80 +++++++++---------- .../org/apache/spark/sql/SQLContext.scala | 2 +- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index a14e5b9ef14d0..8e39f79d2ca51 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.catalyst import java.sql.{Date, Timestamp} -import org.apache.spark.sql.catalyst.types.decimal.Decimal - import scala.language.implicitConversions import scala.reflect.runtime.universe.{TypeTag, typeTag} @@ -29,6 +27,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} import org.apache.spark.sql.catalyst.types._ +import org.apache.spark.sql.catalyst.types.decimal.Decimal /** * A collection of implicit conversions that create a DSL for constructing catalyst data structures. 
@@ -119,21 +118,22 @@ package object dsl { def expr = e } - implicit def booleanToLiteral(b: Boolean) = Literal(b) - implicit def byteToLiteral(b: Byte) = Literal(b) - implicit def shortToLiteral(s: Short) = Literal(s) - implicit def intToLiteral(i: Int) = Literal(i) - implicit def longToLiteral(l: Long) = Literal(l) - implicit def floatToLiteral(f: Float) = Literal(f) - implicit def doubleToLiteral(d: Double) = Literal(d) - implicit def stringToLiteral(s: String) = Literal(s) - implicit def dateToLiteral(d: Date) = Literal(d) - implicit def bigDecimalToLiteral(d: BigDecimal) = Literal(d) - implicit def decimalToLiteral(d: Decimal) = Literal(d) - implicit def timestampToLiteral(t: Timestamp) = Literal(t) - implicit def binaryToLiteral(a: Array[Byte]) = Literal(a) - - implicit def symbolToUnresolvedAttribute(s: Symbol) = analysis.UnresolvedAttribute(s.name) + implicit def booleanToLiteral(b: Boolean): Literal = Literal(b) + implicit def byteToLiteral(b: Byte): Literal = Literal(b) + implicit def shortToLiteral(s: Short): Literal = Literal(s) + implicit def intToLiteral(i: Int): Literal = Literal(i) + implicit def longToLiteral(l: Long): Literal = Literal(l) + implicit def floatToLiteral(f: Float): Literal = Literal(f) + implicit def doubleToLiteral(d: Double): Literal = Literal(d) + implicit def stringToLiteral(s: String): Literal = Literal(s) + implicit def dateToLiteral(d: Date): Literal = Literal(d) + implicit def bigDecimalToLiteral(d: BigDecimal): Literal = Literal(d) + implicit def decimalToLiteral(d: Decimal): Literal = Literal(d) + implicit def timestampToLiteral(t: Timestamp): Literal = Literal(t) + implicit def binaryToLiteral(a: Array[Byte]): Literal = Literal(a) + + implicit def symbolToUnresolvedAttribute(s: Symbol): analysis.UnresolvedAttribute = + analysis.UnresolvedAttribute(s.name) def sum(e: Expression) = Sum(e) def sumDistinct(e: Expression) = SumDistinct(e) @@ -301,52 +301,52 @@ package object dsl { (1 to 22).map { x => val argTypes = Seq.fill(x)("_").mkString(", ") - s"implicit def functionToUdfBuilder[T: TypeTag](func: Function$x[$argTypes, T]) = ScalaUdfBuilder(func)" + s"implicit def functionToUdfBuilder[T: TypeTag](func: Function$x[$argTypes, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func)" } */ - implicit def functionToUdfBuilder[T: TypeTag](func: Function1[_, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function1[_, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function2[_, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function2[_, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function3[_, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function3[_, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function4[_, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function4[_, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function5[_, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function5[_, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function6[_, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function6[_, _, _, _, 
_, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function7[_, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function7[_, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function8[_, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function8[_, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function9[_, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function9[_, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function10[_, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function10[_, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function11[_, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function11[_, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function12[_, _, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function12[_, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function13[_, _, _, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function13[_, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]) = 
ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) - implicit def functionToUdfBuilder[T: TypeTag](func: Function22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]) = ScalaUdfBuilder(func) + implicit def functionToUdfBuilder[T: TypeTag](func: Function22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, T]): ScalaUdfBuilder[T] = ScalaUdfBuilder(func) // scalastyle:on } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 7a13302229012..6a1a4d995bf61 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -106,7 +106,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @group userf */ - implicit def createSchemaRDD[A <: Product: TypeTag](rdd: RDD[A]) = { + implicit def createSchemaRDD[A <: Product: TypeTag](rdd: RDD[A]): SchemaRDD = { SparkPlan.currentContext.set(self) val attributeSeq = ScalaReflection.attributesFor[A] val schema = StructType.fromAttributes(attributeSeq) From 3610d3c615112faef98d94f04efaea602cc4aa8f Mon Sep 17 00:00:00 2001 From: Hari Shreedharan Date: Wed, 31 Dec 2014 14:35:07 -0800 Subject: [PATCH 039/116] [SPARK-4790][STREAMING] Fix ReceivedBlockTrackerSuite waits for old file... ...s to get deleted before continuing. Since the deletes are happening asynchronously, the getFileStatus call might throw an exception in older HDFS versions, if the delete happens between the time listFiles is called on the directory and getFileStatus is called on the file in the getFileStatus method. This PR addresses this by adding an option to delete the files synchronously and then waiting for the deletion to complete before proceeding. Author: Hari Shreedharan Closes #3726 from harishreedharan/spark-4790 and squashes the following commits: bbbacd1 [Hari Shreedharan] Call cleanUpOldLogs only once in the tests. 3255f17 [Hari Shreedharan] Add test for async deletion. Remove method from ReceiverTracker that does not take waitForCompletion. e4c83ec [Hari Shreedharan] Making waitForCompletion a mandatory param. Remove eventually from WALSuite since the cleanup method returns only after all files are deleted. af00fd1 [Hari Shreedharan] [SPARK-4790][STREAMING] Fix ReceivedBlockTrackerSuite waits for old files to get deleted before continuing. 
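The core of the change, reduced to a sketch with a placeholder delete action (the real code removes old write ahead log files; deleteOldFiles here is only an assumed stand-in):

    import scala.concurrent.{Await, ExecutionContext, Future}
    import scala.concurrent.duration.Duration

    // Hypothetical stand-in for the log manager's cleanup path.
    def cleanupOldLogs(threshTime: Long, waitForCompletion: Boolean)
                      (deleteOldFiles: () => Unit)(implicit ec: ExecutionContext): Unit = {
      val deletion = Future { deleteOldFiles() }
      if (waitForCompletion) {
        // Tests need the files to be gone before they inspect the directory again,
        // so block until the asynchronous delete actually finishes.
        Await.ready(deletion, Duration.Inf)
      }
    }

Production callers keep passing waitForCompletion = false, so the deletes stay asynchronous outside of tests.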
--- .../receiver/ReceivedBlockHandler.scala | 8 ++++---- .../scheduler/ReceivedBlockTracker.scala | 9 ++++++--- .../streaming/scheduler/ReceiverTracker.scala | 2 +- .../streaming/util/WriteAheadLogManager.scala | 17 +++++++++++++---- .../streaming/ReceivedBlockHandlerSuite.scala | 2 +- .../streaming/ReceivedBlockTrackerSuite.scala | 2 +- .../streaming/util/WriteAheadLogSuite.scala | 18 ++++++++++++++++-- 7 files changed, 42 insertions(+), 16 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala index 8b97db8dd36f1..f7a8ebee8a544 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala @@ -42,7 +42,7 @@ private[streaming] trait ReceivedBlockHandler { def storeBlock(blockId: StreamBlockId, receivedBlock: ReceivedBlock): ReceivedBlockStoreResult /** Cleanup old blocks older than the given threshold time */ - def cleanupOldBlock(threshTime: Long) + def cleanupOldBlocks(threshTime: Long) } @@ -82,7 +82,7 @@ private[streaming] class BlockManagerBasedBlockHandler( BlockManagerBasedStoreResult(blockId) } - def cleanupOldBlock(threshTime: Long) { + def cleanupOldBlocks(threshTime: Long) { // this is not used as blocks inserted into the BlockManager are cleared by DStream's clearing // of BlockRDDs. } @@ -192,8 +192,8 @@ private[streaming] class WriteAheadLogBasedBlockHandler( WriteAheadLogBasedStoreResult(blockId, segment) } - def cleanupOldBlock(threshTime: Long) { - logManager.cleanupOldLogs(threshTime) + def cleanupOldBlocks(threshTime: Long) { + logManager.cleanupOldLogs(threshTime, waitForCompletion = false) } def stop() { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index 02758e0bca6c5..2ce458cddec1a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -139,14 +139,17 @@ private[streaming] class ReceivedBlockTracker( getReceivedBlockQueue(streamId).toSeq } - /** Clean up block information of old batches. */ - def cleanupOldBatches(cleanupThreshTime: Time): Unit = synchronized { + /** + * Clean up block information of old batches. If waitForCompletion is true, this method + * returns only after the files are cleaned up. 
+ */ + def cleanupOldBatches(cleanupThreshTime: Time, waitForCompletion: Boolean): Unit = synchronized { assert(cleanupThreshTime.milliseconds < clock.currentTime()) val timesToCleanup = timeToAllocatedBlocks.keys.filter { _ < cleanupThreshTime }.toSeq logInfo("Deleting batches " + timesToCleanup) writeToLog(BatchCleanupEvent(timesToCleanup)) timeToAllocatedBlocks --= timesToCleanup - logManagerOption.foreach(_.cleanupOldLogs(cleanupThreshTime.milliseconds)) + logManagerOption.foreach(_.cleanupOldLogs(cleanupThreshTime.milliseconds, waitForCompletion)) log } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index 1f0e442a12283..8dbb42a86e3bd 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -121,7 +121,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false /** Clean up metadata older than the given threshold time */ def cleanupOldMetadata(cleanupThreshTime: Time) { - receivedBlockTracker.cleanupOldBatches(cleanupThreshTime) + receivedBlockTracker.cleanupOldBatches(cleanupThreshTime, waitForCompletion = false) } /** Register a receiver */ diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogManager.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogManager.scala index 70d234320be7c..166661b7496df 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogManager.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogManager.scala @@ -19,11 +19,11 @@ package org.apache.spark.streaming.util import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer -import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.duration.Duration +import scala.concurrent.{Await, ExecutionContext, Future} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.hadoop.fs.permission.FsPermission import org.apache.spark.Logging import org.apache.spark.util.Utils import WriteAheadLogManager._ @@ -124,8 +124,12 @@ private[streaming] class WriteAheadLogManager( * files, which is usually based on the local system time. So if there is coordination necessary * between the node calculating the threshTime (say, driver node), and the local system time * (say, worker node), the caller has to take account of possible time skew. + * + * If waitForCompletion is set to true, this method will return only after old logs have been + * deleted. This should be set to true only for testing. Else the files will be deleted + * asynchronously. 
*/ - def cleanupOldLogs(threshTime: Long): Unit = { + def cleanupOldLogs(threshTime: Long, waitForCompletion: Boolean): Unit = { val oldLogFiles = synchronized { pastLogs.filter { _.endTime < threshTime } } logInfo(s"Attempting to clear ${oldLogFiles.size} old log files in $logDirectory " + s"older than $threshTime: ${oldLogFiles.map { _.path }.mkString("\n")}") @@ -146,10 +150,15 @@ private[streaming] class WriteAheadLogManager( logInfo(s"Cleared log files in $logDirectory older than $threshTime") } if (!executionContext.isShutdown) { - Future { deleteFiles() } + val f = Future { deleteFiles() } + if (waitForCompletion) { + import scala.concurrent.duration._ + Await.ready(f, 1 second) + } } } + /** Stop the manager, close any open log writer */ def stop(): Unit = synchronized { if (currentLogWriter != null) { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala index 3661e16a9ef2f..132ff2443fc0f 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala @@ -168,7 +168,7 @@ class ReceivedBlockHandlerSuite extends FunSuite with BeforeAndAfter with Matche manualClock.currentTime() shouldEqual 5000L val cleanupThreshTime = 3000L - handler.cleanupOldBlock(cleanupThreshTime) + handler.cleanupOldBlocks(cleanupThreshTime) eventually(timeout(10000 millis), interval(10 millis)) { getWriteAheadLogFiles().size should be < preCleanupLogFiles.size } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala index 01a09b67b99dc..de7e9d624bf6b 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala @@ -166,7 +166,7 @@ class ReceivedBlockTrackerSuite // Cleanup first batch but not second batch val oldestLogFile = getWriteAheadLogFiles().head incrementTime() - tracker3.cleanupOldBatches(batchTime2) + tracker3.cleanupOldBatches(batchTime2, waitForCompletion = true) // Verify that the batch allocations have been cleaned, and the act has been written to log tracker3.getBlocksOfBatchAndStream(batchTime1, streamId) shouldEqual Seq.empty diff --git a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala index 8f69bcb64279d..7ce9499dc614d 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala @@ -182,15 +182,29 @@ class WriteAheadLogSuite extends FunSuite with BeforeAndAfter { } test("WriteAheadLogManager - cleanup old logs") { + logCleanUpTest(waitForCompletion = false) + } + + test("WriteAheadLogManager - cleanup old logs synchronously") { + logCleanUpTest(waitForCompletion = true) + } + + private def logCleanUpTest(waitForCompletion: Boolean): Unit = { // Write data with manager, recover with new manager and verify val manualClock = new ManualClock val dataToWrite = generateRandomData() manager = writeDataUsingManager(testDir, dataToWrite, manualClock, stopManager = false) val logFiles = getLogFilesInDirectory(testDir) assert(logFiles.size > 1) - 
manager.cleanupOldLogs(manualClock.currentTime() / 2) - eventually(timeout(1 second), interval(10 milliseconds)) { + + manager.cleanupOldLogs(manualClock.currentTime() / 2, waitForCompletion) + + if (waitForCompletion) { assert(getLogFilesInDirectory(testDir).size < logFiles.size) + } else { + eventually(timeout(1 second), interval(10 milliseconds)) { + assert(getLogFilesInDirectory(testDir).size < logFiles.size) + } } } From fdc2aa4918fd4c510f04812b782cc0bfef9a2107 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Wed, 31 Dec 2014 14:45:31 -0800 Subject: [PATCH 040/116] [SPARK-5028][Streaming]Add total received and processed records metrics to Streaming UI This is a follow-up work of [SPARK-4537](https://issues.apache.org/jira/browse/SPARK-4537). Adding total received records and processed records metrics back to UI. ![screenshot](https://dl.dropboxusercontent.com/u/19230832/screenshot.png) Author: jerryshao Closes #3852 from jerryshao/SPARK-5028 and squashes the following commits: c8c4877 [jerryshao] Add total received and processed metrics to Streaming UI --- .../scala/org/apache/spark/streaming/ui/StreamingPage.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala index 1353e487c72cf..98e9a2e639e25 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala @@ -67,6 +67,12 @@ private[ui] class StreamingPage(parent: StreamingTab)
<li>
<strong>Waiting batches: </strong>{listener.numUnprocessedBatches}
</li>
+ <li>
+ <strong>Received records: </strong>{listener.numTotalReceivedRecords}
+ </li>
+ <li>
+ <strong>Processed records: </strong>{listener.numTotalProcessedRecords}
+ </li>
  • } From c4f0b4f334f7f3565375921fcac184ad5b1fb207 Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Wed, 31 Dec 2014 15:39:58 -0800 Subject: [PATCH 041/116] SPARK-5020 [MLlib] GaussianMixtureModel.predictMembership() should take an RDD only Removed unnecessary parameters to predictMembership() CC: jkbradley Author: Travis Galoppo Closes #3854 from tgaloppo/spark-5020 and squashes the following commits: 1bf4669 [Travis Galoppo] renamed predictMembership() to predictSoft() 0f1d96e [Travis Galoppo] SPARK-5020 - Removed superfluous parameters from predictMembership() --- .../spark/mllib/clustering/GaussianMixtureModel.scala | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 11a110db1f7ca..b461ea4f0f06e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -45,7 +45,7 @@ class GaussianMixtureModel( /** Maps given points to their cluster indices. */ def predict(points: RDD[Vector]): RDD[Int] = { - val responsibilityMatrix = predictMembership(points, mu, sigma, weight, k) + val responsibilityMatrix = predictSoft(points) responsibilityMatrix.map(r => r.indexOf(r.max)) } @@ -53,12 +53,7 @@ class GaussianMixtureModel( * Given the input vectors, return the membership value of each vector * to all mixture components. */ - def predictMembership( - points: RDD[Vector], - mu: Array[Vector], - sigma: Array[Matrix], - weight: Array[Double], - k: Int): RDD[Array[Double]] = { + def predictSoft(points: RDD[Vector]): RDD[Array[Double]] = { val sc = points.sparkContext val dists = sc.broadcast { (0 until k).map { i => From fe6efacc0b865e9e827a1565877077000e63976e Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 31 Dec 2014 16:02:47 -0800 Subject: [PATCH 042/116] [SPARK-5035] [Streaming] ReceiverMessage trait should extend Serializable Spark Streaming's ReceiverMessage trait should extend Serializable in order to fix a subtle bug that only occurs when running on a real cluster: If you attempt to send a fire-and-forget message to a remote Akka actor and that message cannot be serialized, then this seems to lead to more-or-less silent failures. As an optimization, Akka skips message serialization for messages sent within the same JVM. As a result, Spark's unit tests will never fail due to non-serializable Akka messages, but these will cause mostly-silent failures when running on a real cluster. Before this patch, here was the code for ReceiverMessage: ``` /** Messages sent to the NetworkReceiver. */ private[streaming] sealed trait ReceiverMessage private[streaming] object StopReceiver extends ReceiverMessage ``` Since ReceiverMessage does not extend Serializable and StopReceiver is a regular `object`, not a `case object`, StopReceiver will throw serialization errors. As a result, graceful receiver shutdown is broken on real clusters (and local-cluster mode) but works in local modes. 
If you want to reproduce this, try running the word count example from the Streaming Programming Guide in the Spark shell: ``` import org.apache.spark._ import org.apache.spark.streaming._ import org.apache.spark.streaming.StreamingContext._ val ssc = new StreamingContext(sc, Seconds(10)) // Create a DStream that will connect to hostname:port, like localhost:9999 val lines = ssc.socketTextStream("localhost", 9999) // Split each line into words val words = lines.flatMap(_.split(" ")) import org.apache.spark.streaming.StreamingContext._ // Count each word in each batch val pairs = words.map(word => (word, 1)) val wordCounts = pairs.reduceByKey(_ + _) // Print the first ten elements of each RDD generated in this DStream to the console wordCounts.print() ssc.start() Thread.sleep(10000) ssc.stop(true, true) ``` Prior to this patch, this would work correctly in local mode but fail when running against a real cluster (it would report that some receivers were not shut down). Author: Josh Rosen Closes #3857 from JoshRosen/SPARK-5035 and squashes the following commits: 71d0eae [Josh Rosen] [SPARK-5035] ReceiverMessage trait should extend Serializable. --- .../org/apache/spark/streaming/receiver/ReceiverMessage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala index bf39d1e891cae..ab9fa192191aa 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala @@ -18,6 +18,6 @@ package org.apache.spark.streaming.receiver /** Messages sent to the NetworkReceiver. */ -private[streaming] sealed trait ReceiverMessage +private[streaming] sealed trait ReceiverMessage extends Serializable private[streaming] object StopReceiver extends ReceiverMessage From 4bb12488d56ea651c56d9688996b464b99095582 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 31 Dec 2014 16:59:17 -0800 Subject: [PATCH 043/116] SPARK-2757 [BUILD] [STREAMING] Add Mima test for Spark Sink after 1.10 is released Re-enable MiMa for Streaming Flume Sink module, now that 1.1.0 is released, per the JIRA TO-DO. That's pretty much all there is to this. Author: Sean Owen Closes #3842 from srowen/SPARK-2757 and squashes the following commits: 50ff80e [Sean Owen] Exclude apparent false positive turned up by re-enabling MiMa checks for Streaming Flume Sink 0e5ba5c [Sean Owen] Re-enable MiMa for Streaming Flume Sink module --- project/MimaExcludes.scala | 5 +++++ project/SparkBuild.scala | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 230239aa40500..c377e5cffa7d2 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -53,6 +53,11 @@ object MimaExcludes { "org.apache.spark.mllib.linalg.Matrices.randn"), ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.mllib.linalg.Matrices.rand") + ) ++ Seq( + // SPARK-2757 + ProblemFilters.exclude[IncompatibleResultTypeProblem]( + "org.apache.spark.streaming.flume.sink.SparkAvroCallbackHandler." 
+ + "removeAndGetProcessor") ) case v if v.startsWith("1.2") => diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index c512b62f6137e..46a54c6818409 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -166,7 +166,7 @@ object SparkBuild extends PomBuild { // TODO: Add Sql to mima checks allProjects.filterNot(x => Seq(spark, sql, hive, hiveThriftServer, catalyst, repl, - streamingFlumeSink, networkCommon, networkShuffle, networkYarn).contains(x)).foreach { + networkCommon, networkShuffle, networkYarn).contains(x)).foreach { x => enable(MimaBuild.mimaSettings(sparkHome, x))(x) } From 7749dd6c36a182478b20f4636734c8db0b7ddb00 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 31 Dec 2014 17:07:47 -0800 Subject: [PATCH 044/116] [SPARK-5038] Add explicit return type for implicit functions. As we learned in #3580, not explicitly typing implicit functions can lead to compiler bugs and potentially unexpected runtime behavior. This is a follow up PR for rest of Spark (outside Spark SQL). The original PR for Spark SQL can be found at https://github.com/apache/spark/pull/3859 Author: Reynold Xin Closes #3860 from rxin/implicit and squashes the following commits: 73702f9 [Reynold Xin] [SPARK-5038] Add explicit return type for implicit functions. --- .../scala/org/apache/spark/SparkContext.scala | 14 ++--- .../scala/org/apache/spark/util/Vector.scala | 38 +++++------ .../graphx/impl/EdgePartitionBuilder.scala | 63 ++++++++++--------- .../impl/ShippableVertexPartition.scala | 4 +- .../spark/graphx/impl/VertexPartition.scala | 4 +- .../graphx/impl/VertexPartitionBaseOps.scala | 4 +- 6 files changed, 64 insertions(+), 63 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 57bc3d4e4ae36..df1cb3cda2dba 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1708,19 +1708,19 @@ object SparkContext extends Logging { // Implicit conversions to common Writable types, for saveAsSequenceFile - implicit def intToIntWritable(i: Int) = new IntWritable(i) + implicit def intToIntWritable(i: Int): IntWritable = new IntWritable(i) - implicit def longToLongWritable(l: Long) = new LongWritable(l) + implicit def longToLongWritable(l: Long): LongWritable = new LongWritable(l) - implicit def floatToFloatWritable(f: Float) = new FloatWritable(f) + implicit def floatToFloatWritable(f: Float): FloatWritable = new FloatWritable(f) - implicit def doubleToDoubleWritable(d: Double) = new DoubleWritable(d) + implicit def doubleToDoubleWritable(d: Double): DoubleWritable = new DoubleWritable(d) - implicit def boolToBoolWritable (b: Boolean) = new BooleanWritable(b) + implicit def boolToBoolWritable (b: Boolean): BooleanWritable = new BooleanWritable(b) - implicit def bytesToBytesWritable (aob: Array[Byte]) = new BytesWritable(aob) + implicit def bytesToBytesWritable (aob: Array[Byte]): BytesWritable = new BytesWritable(aob) - implicit def stringToText(s: String) = new Text(s) + implicit def stringToText(s: String): Text = new Text(s) private implicit def arrayToArrayWritable[T <% Writable: ClassTag](arr: Traversable[T]) : ArrayWritable = { diff --git a/core/src/main/scala/org/apache/spark/util/Vector.scala b/core/src/main/scala/org/apache/spark/util/Vector.scala index c6cab82c3e546..2ed827eab46df 100644 --- a/core/src/main/scala/org/apache/spark/util/Vector.scala +++ b/core/src/main/scala/org/apache/spark/util/Vector.scala @@ 
-24,9 +24,9 @@ import org.apache.spark.util.random.XORShiftRandom @deprecated("Use Vectors.dense from Spark's mllib.linalg package instead.", "1.0.0") class Vector(val elements: Array[Double]) extends Serializable { - def length = elements.length + def length: Int = elements.length - def apply(index: Int) = elements(index) + def apply(index: Int): Double = elements(index) def + (other: Vector): Vector = { if (length != other.length) { @@ -35,7 +35,7 @@ class Vector(val elements: Array[Double]) extends Serializable { Vector(length, i => this(i) + other(i)) } - def add(other: Vector) = this + other + def add(other: Vector): Vector = this + other def - (other: Vector): Vector = { if (length != other.length) { @@ -44,7 +44,7 @@ class Vector(val elements: Array[Double]) extends Serializable { Vector(length, i => this(i) - other(i)) } - def subtract(other: Vector) = this - other + def subtract(other: Vector): Vector = this - other def dot(other: Vector): Double = { if (length != other.length) { @@ -93,19 +93,19 @@ class Vector(val elements: Array[Double]) extends Serializable { this } - def addInPlace(other: Vector) = this +=other + def addInPlace(other: Vector): Vector = this +=other def * (scale: Double): Vector = Vector(length, i => this(i) * scale) - def multiply (d: Double) = this * d + def multiply (d: Double): Vector = this * d def / (d: Double): Vector = this * (1 / d) - def divide (d: Double) = this / d + def divide (d: Double): Vector = this / d - def unary_- = this * -1 + def unary_- : Vector = this * -1 - def sum = elements.reduceLeft(_ + _) + def sum: Double = elements.reduceLeft(_ + _) def squaredDist(other: Vector): Double = { var ans = 0.0 @@ -119,40 +119,40 @@ class Vector(val elements: Array[Double]) extends Serializable { def dist(other: Vector): Double = math.sqrt(squaredDist(other)) - override def toString = elements.mkString("(", ", ", ")") + override def toString: String = elements.mkString("(", ", ", ")") } object Vector { - def apply(elements: Array[Double]) = new Vector(elements) + def apply(elements: Array[Double]): Vector = new Vector(elements) - def apply(elements: Double*) = new Vector(elements.toArray) + def apply(elements: Double*): Vector = new Vector(elements.toArray) def apply(length: Int, initializer: Int => Double): Vector = { val elements: Array[Double] = Array.tabulate(length)(initializer) new Vector(elements) } - def zeros(length: Int) = new Vector(new Array[Double](length)) + def zeros(length: Int): Vector = new Vector(new Array[Double](length)) - def ones(length: Int) = Vector(length, _ => 1) + def ones(length: Int): Vector = Vector(length, _ => 1) /** * Creates this [[org.apache.spark.util.Vector]] of given length containing random numbers * between 0.0 and 1.0. Optional scala.util.Random number generator can be provided. 
*/ - def random(length: Int, random: Random = new XORShiftRandom()) = + def random(length: Int, random: Random = new XORShiftRandom()): Vector = Vector(length, _ => random.nextDouble()) class Multiplier(num: Double) { - def * (vec: Vector) = vec * num + def * (vec: Vector): Vector = vec * num } - implicit def doubleToMultiplier(num: Double) = new Multiplier(num) + implicit def doubleToMultiplier(num: Double): Multiplier = new Multiplier(num) implicit object VectorAccumParam extends org.apache.spark.AccumulatorParam[Vector] { - def addInPlace(t1: Vector, t2: Vector) = t1 + t2 + def addInPlace(t1: Vector, t2: Vector): Vector = t1 + t2 - def zero(initialValue: Vector) = Vector.zeros(initialValue.length) + def zero(initialValue: Vector): Vector = Vector.zeros(initialValue.length) } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartitionBuilder.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartitionBuilder.scala index 409cf60977f6f..906d42328fcb9 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartitionBuilder.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartitionBuilder.scala @@ -129,44 +129,45 @@ private[impl] case class EdgeWithLocalIds[@specialized ED]( srcId: VertexId, dstId: VertexId, localSrcId: Int, localDstId: Int, attr: ED) private[impl] object EdgeWithLocalIds { - implicit def lexicographicOrdering[ED] = new Ordering[EdgeWithLocalIds[ED]] { - override def compare(a: EdgeWithLocalIds[ED], b: EdgeWithLocalIds[ED]): Int = { - if (a.srcId == b.srcId) { - if (a.dstId == b.dstId) 0 - else if (a.dstId < b.dstId) -1 + implicit def lexicographicOrdering[ED]: Ordering[EdgeWithLocalIds[ED]] = + new Ordering[EdgeWithLocalIds[ED]] { + override def compare(a: EdgeWithLocalIds[ED], b: EdgeWithLocalIds[ED]): Int = { + if (a.srcId == b.srcId) { + if (a.dstId == b.dstId) 0 + else if (a.dstId < b.dstId) -1 + else 1 + } else if (a.srcId < b.srcId) -1 else 1 - } else if (a.srcId < b.srcId) -1 - else 1 + } } - } - private[graphx] def edgeArraySortDataFormat[ED] - = new SortDataFormat[EdgeWithLocalIds[ED], Array[EdgeWithLocalIds[ED]]] { - override def getKey( - data: Array[EdgeWithLocalIds[ED]], pos: Int): EdgeWithLocalIds[ED] = { - data(pos) - } + private[graphx] def edgeArraySortDataFormat[ED] = { + new SortDataFormat[EdgeWithLocalIds[ED], Array[EdgeWithLocalIds[ED]]] { + override def getKey(data: Array[EdgeWithLocalIds[ED]], pos: Int): EdgeWithLocalIds[ED] = { + data(pos) + } - override def swap(data: Array[EdgeWithLocalIds[ED]], pos0: Int, pos1: Int): Unit = { - val tmp = data(pos0) - data(pos0) = data(pos1) - data(pos1) = tmp - } + override def swap(data: Array[EdgeWithLocalIds[ED]], pos0: Int, pos1: Int): Unit = { + val tmp = data(pos0) + data(pos0) = data(pos1) + data(pos1) = tmp + } - override def copyElement( - src: Array[EdgeWithLocalIds[ED]], srcPos: Int, - dst: Array[EdgeWithLocalIds[ED]], dstPos: Int) { - dst(dstPos) = src(srcPos) - } + override def copyElement( + src: Array[EdgeWithLocalIds[ED]], srcPos: Int, + dst: Array[EdgeWithLocalIds[ED]], dstPos: Int) { + dst(dstPos) = src(srcPos) + } - override def copyRange( - src: Array[EdgeWithLocalIds[ED]], srcPos: Int, - dst: Array[EdgeWithLocalIds[ED]], dstPos: Int, length: Int) { - System.arraycopy(src, srcPos, dst, dstPos, length) - } + override def copyRange( + src: Array[EdgeWithLocalIds[ED]], srcPos: Int, + dst: Array[EdgeWithLocalIds[ED]], dstPos: Int, length: Int) { + System.arraycopy(src, srcPos, dst, dstPos, length) + } - override def allocate(length: 
Int): Array[EdgeWithLocalIds[ED]] = { - new Array[EdgeWithLocalIds[ED]](length) + override def allocate(length: Int): Array[EdgeWithLocalIds[ED]] = { + new Array[EdgeWithLocalIds[ED]](length) + } } } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala index 5412d720475dc..aa320088f2088 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala @@ -74,8 +74,8 @@ object ShippableVertexPartition { * Implicit conversion to allow invoking `VertexPartitionBase` operations directly on a * `ShippableVertexPartition`. */ - implicit def shippablePartitionToOps[VD: ClassTag](partition: ShippableVertexPartition[VD]) = - new ShippableVertexPartitionOps(partition) + implicit def shippablePartitionToOps[VD: ClassTag](partition: ShippableVertexPartition[VD]) + : ShippableVertexPartitionOps[VD] = new ShippableVertexPartitionOps(partition) /** * Implicit evidence that `ShippableVertexPartition` is a member of the diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartition.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartition.scala index 55c7a19d1bdab..fbe53acfc32aa 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartition.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartition.scala @@ -38,8 +38,8 @@ private[graphx] object VertexPartition { * Implicit conversion to allow invoking `VertexPartitionBase` operations directly on a * `VertexPartition`. */ - implicit def partitionToOps[VD: ClassTag](partition: VertexPartition[VD]) = - new VertexPartitionOps(partition) + implicit def partitionToOps[VD: ClassTag](partition: VertexPartition[VD]) + : VertexPartitionOps[VD] = new VertexPartitionOps(partition) /** * Implicit evidence that `VertexPartition` is a member of the `VertexPartitionBaseOpsConstructor` diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala index b40aa1b417a0f..4fd2548b7faf6 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala @@ -238,8 +238,8 @@ private[graphx] abstract class VertexPartitionBaseOps * because these methods return a `Self` and this implicit conversion re-wraps that in a * `VertexPartitionBaseOps`. This relies on the context bound on `Self`. */ - private implicit def toOps[VD2: ClassTag]( - partition: Self[VD2]): VertexPartitionBaseOps[VD2, Self] = { + private implicit def toOps[VD2: ClassTag](partition: Self[VD2]) + : VertexPartitionBaseOps[VD2, Self] = { implicitly[VertexPartitionBaseOpsConstructor[Self]].toOps(partition) } } From 012839807c3dc6e7c8c41ac6e956d52a550bb031 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 1 Jan 2015 15:03:54 -0800 Subject: [PATCH 045/116] [HOTFIX] Bind web UI to ephemeral port in DriverSuite The job launched by DriverSuite should bind the web UI to an ephemeral port, since it looks like port contention in this test has caused a large number of Jenkins failures when many builds are started simultaneously. Our tests already disable the web UI, but this doesn't affect subprocesses launched by our tests. 
In this case, I've opted to bind to an ephemeral port instead of disabling the UI because disabling features in this test may mask its ability to catch certain bugs. See also: e24d3a9 Author: Josh Rosen Closes #3873 from JoshRosen/driversuite-webui-port and squashes the following commits: 48cd05c [Josh Rosen] [HOTFIX] Bind web UI to ephemeral port in DriverSuite. --- core/src/test/scala/org/apache/spark/DriverSuite.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/DriverSuite.scala b/core/src/test/scala/org/apache/spark/DriverSuite.scala index 5265ba904032f..541d8eac80556 100644 --- a/core/src/test/scala/org/apache/spark/DriverSuite.scala +++ b/core/src/test/scala/org/apache/spark/DriverSuite.scala @@ -50,7 +50,10 @@ class DriverSuite extends FunSuite with Timeouts { object DriverWithoutCleanup { def main(args: Array[String]) { Utils.configTestLog4j("INFO") - val sc = new SparkContext(args(0), "DriverWithoutCleanup") + // Bind the web UI to an ephemeral port in order to avoid conflicts with other tests running on + // the same machine (we shouldn't just disable the UI here, since that might mask bugs): + val conf = new SparkConf().set("spark.ui.port", "0") + val sc = new SparkContext(args(0), "DriverWithoutCleanup", conf) sc.parallelize(1 to 100, 4).count() } } From bd88b7185358ae60efc83dc6cbb3fb1d2bff6074 Mon Sep 17 00:00:00 2001 From: Yadong Qi Date: Fri, 2 Jan 2015 15:09:41 -0800 Subject: [PATCH 046/116] [SPARK-3325][Streaming] Add a parameter to the method print in class DStream This PR is a fixed version of the original PR #3237 by watermen and scwf. This adds the ability to specify how many elements to print in `DStream.print`. Author: Yadong Qi Author: q00251598 Author: Tathagata Das Author: wangfei Closes #3865 from tdas/print-num and squashes the following commits: cd34e9e [Tathagata Das] Fix bug 7c09f16 [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into HEAD bb35d1a [Yadong Qi] Update MimaExcludes.scala f8098ca [Yadong Qi] Update MimaExcludes.scala f6ac3cb [Yadong Qi] Update MimaExcludes.scala e4ed897 [Yadong Qi] Update MimaExcludes.scala 3b9d5cf [wangfei] fix conflicts ec8a3af [q00251598] move to Spark 1.3 26a70c0 [q00251598] extend the Python DStream's print b589a4b [q00251598] add another print function --- project/MimaExcludes.scala | 3 +++ python/pyspark/streaming/dstream.py | 12 +++++++----- .../spark/streaming/api/java/JavaDStreamLike.scala | 10 +++++++++- .../apache/spark/streaming/dstream/DStream.scala | 14 +++++++++++--- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index c377e5cffa7d2..31d4c317ae569 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -54,6 +54,9 @@ object MimaExcludes { ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.mllib.linalg.Matrices.rand") ) ++ Seq( + // SPARK-3325 + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.streaming.api.java.JavaDStreamLike.print"), // SPARK-2757 ProblemFilters.exclude[IncompatibleResultTypeProblem]( "org.apache.spark.streaming.flume.sink.SparkAvroCallbackHandler." 
+ diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 0826ddc56e844..2fe39392ff081 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -157,18 +157,20 @@ def foreachRDD(self, func): api = self._ssc._jvm.PythonDStream api.callForeachRDD(self._jdstream, jfunc) - def pprint(self): + def pprint(self, num=10): """ - Print the first ten elements of each RDD generated in this DStream. + Print the first num elements of each RDD generated in this DStream. + + @param num: the number of elements from the first will be printed. """ def takeAndPrint(time, rdd): - taken = rdd.take(11) + taken = rdd.take(num + 1) print "-------------------------------------------" print "Time: %s" % time print "-------------------------------------------" - for record in taken[:10]: + for record in taken[:num]: print record - if len(taken) > 10: + if len(taken) > num: print "..." print diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala index 2a7004e56ef53..e0542eda1383f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala @@ -51,7 +51,15 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T * operator, so this DStream will be registered as an output stream and there materialized. */ def print(): Unit = { - dstream.print() + print(10) + } + + /** + * Print the first num elements of each RDD generated in this DStream. This is an output + * operator, so this DStream will be registered as an output stream and there materialized. + */ + def print(num: Int): Unit = { + dstream.print(num) } /** diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index 7f8651e719d84..28fc00cf3944f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -605,13 +605,21 @@ abstract class DStream[T: ClassTag] ( * operator, so this DStream will be registered as an output stream and there materialized. */ def print() { + print(10) + } + + /** + * Print the first num elements of each RDD generated in this DStream. This is an output + * operator, so this DStream will be registered as an output stream and there materialized. 
+ */ + def print(num: Int) { def foreachFunc = (rdd: RDD[T], time: Time) => { - val first11 = rdd.take(11) + val firstNum = rdd.take(num + 1) println ("-------------------------------------------") println ("Time: " + time) println ("-------------------------------------------") - first11.take(10).foreach(println) - if (first11.size > 10) println("...") + firstNum.take(num).foreach(println) + if (firstNum.size > num) println("...") println() } new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register() From cdccc263b20c1bb27b864411c82cfad7daca1f47 Mon Sep 17 00:00:00 2001 From: Akhil Das Date: Fri, 2 Jan 2015 15:12:27 -0800 Subject: [PATCH 047/116] Fixed typos in streaming-kafka-integration.md Changed projrect to project :) Author: Akhil Das Closes #3876 from akhld/patch-1 and squashes the following commits: e0cf9ef [Akhil Das] Fixed typos in streaming-kafka-integration.md --- docs/streaming-kafka-integration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/streaming-kafka-integration.md b/docs/streaming-kafka-integration.md index 1c956fcb40da8..4378521dcac70 100644 --- a/docs/streaming-kafka-integration.md +++ b/docs/streaming-kafka-integration.md @@ -4,7 +4,7 @@ title: Spark Streaming + Kafka Integration Guide --- [Apache Kafka](http://kafka.apache.org/) is publish-subscribe messaging rethought as a distributed, partitioned, replicated commit log service. Here we explain how to configure Spark Streaming to receive data from Kafka. -1. **Linking:** In your SBT/Maven projrect definition, link your streaming application against the following artifact (see [Linking section](streaming-programming-guide.html#linking) in the main programming guide for further information). +1. **Linking:** In your SBT/Maven project definition, link your streaming application against the following artifact (see [Linking section](streaming-programming-guide.html#linking) in the main programming guide for further information). groupId = org.apache.spark artifactId = spark-streaming-kafka_{{site.SCALA_BINARY_VERSION}} From 342612b65f3d77c660383a332f0346872f076647 Mon Sep 17 00:00:00 2001 From: sigmoidanalytics Date: Sat, 3 Jan 2015 19:46:08 -0800 Subject: [PATCH 048/116] [SPARK-5058] Updated broken links Updated the broken link pointing to the KafkaWordCount example to the correct one. Author: sigmoidanalytics Closes #3877 from sigmoidanalytics/patch-1 and squashes the following commits: 3e19b31 [sigmoidanalytics] Updated broken links --- docs/streaming-kafka-integration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/streaming-kafka-integration.md b/docs/streaming-kafka-integration.md index 4378521dcac70..0e38fe2144e9f 100644 --- a/docs/streaming-kafka-integration.md +++ b/docs/streaming-kafka-integration.md @@ -20,7 +20,7 @@ title: Spark Streaming + Kafka Integration Guide streamingContext, [zookeeperQuorum], [group id of the consumer], [per-topic number of Kafka partitions to consume]) See the [API docs](api/scala/index.html#org.apache.spark.streaming.kafka.KafkaUtils$) - and the [example]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala). + and the [example]({{site.SPARK_GITHUB_URL}}/blob/master/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/KafkaWordCount.scala).
    import org.apache.spark.streaming.kafka.*; From b96008d5529bac5fd57b76554fd01760139cffff Mon Sep 17 00:00:00 2001 From: Brennon York Date: Sun, 4 Jan 2015 12:40:39 -0800 Subject: [PATCH 049/116] [SPARK-794][Core] Remove sleep() in ClusterScheduler.stop Removed `sleep()` from the `stop()` method of the `TaskSchedulerImpl` class which, from the JIRA ticket, is believed to be a legacy artifact slowing down testing originally introduced in the `ClusterScheduler` class. Author: Brennon York Closes #3851 from brennonyork/SPARK-794 and squashes the following commits: 04c3e64 [Brennon York] Removed sleep() from the stop() method --- .../scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala | 3 --- 1 file changed, 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index cd3c015321e85..a41f3eef195d2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -394,9 +394,6 @@ private[spark] class TaskSchedulerImpl( taskResultGetter.stop() } starvationTimer.cancel() - - // sleeping for an arbitrary 1 seconds to ensure that messages are sent out. - Thread.sleep(1000L) } override def defaultParallelism() = backend.defaultParallelism() From 3fddc9468fa50e7683caa973fec6c52e1132268d Mon Sep 17 00:00:00 2001 From: Dale Date: Sun, 4 Jan 2015 13:28:37 -0800 Subject: [PATCH 050/116] [SPARK-4787] Stop SparkContext if a DAGScheduler init error occurs Author: Dale Closes #3809 from tigerquoll/SPARK-4787 and squashes the following commits: 5661e01 [Dale] [SPARK-4787] Ensure that call to stop() doesn't lose the exception by using a finally block. 2172578 [Dale] [SPARK-4787] Stop context properly if an exception occurs during DAGScheduler initialization. 
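For illustration, a small self-contained sketch of the error-handling shape the squashed commits above describe: attempt cleanup when construction fails, but re-throw from a `finally` block so the original cause is never lost. The class and method names are invented for the example, not Spark's.

```
// Illustrative only: if initialization fails, try best-effort cleanup, and
// re-throw from the finally block so the original failure is propagated as the
// cause even if cleanup itself misbehaves.
class Service {
  private def init(): Unit = throw new IllegalStateException("init failed")
  private def stop(): Unit = println("releasing resources")

  try {
    init()
  } catch {
    case e: Exception =>
      try {
        stop()
      } finally {
        throw new RuntimeException("Error while constructing Service", e)
      }
  }
}
```

Constructing `new Service()` now surfaces a single exception whose cause is the original `IllegalStateException`, rather than losing it behind whatever happens inside `stop()`.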
--- core/src/main/scala/org/apache/spark/SparkContext.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index df1cb3cda2dba..4c25d5d6c0ceb 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -329,8 +329,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli try { dagScheduler = new DAGScheduler(this) } catch { - case e: Exception => throw - new SparkException("DAGScheduler cannot be initialized due to %s".format(e.getMessage)) + case e: Exception => { + try { + stop() + } finally { + throw new SparkException("Error while constructing DAGScheduler", e) + } + } } // start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's From e767d7ddac5c2330af553f2a74b8575dfc7afb67 Mon Sep 17 00:00:00 2001 From: bilna Date: Sun, 4 Jan 2015 19:37:48 -0800 Subject: [PATCH 051/116] [SPARK-4631] unit test for MQTT Please review the unit test for MQTT Author: bilna Author: Bilna P Closes #3844 from Bilna/master and squashes the following commits: acea3a3 [bilna] Adding dependency with scope test 28681fa [bilna] Merge remote-tracking branch 'upstream/master' fac3904 [bilna] Correction in Indentation and coding style ed9db4c [bilna] Merge remote-tracking branch 'upstream/master' 4b34ee7 [Bilna P] Update MQTTStreamSuite.scala 04503cf [bilna] Added embedded broker service for mqtt test 89d804e [bilna] Merge remote-tracking branch 'upstream/master' fc8eb28 [bilna] Merge remote-tracking branch 'upstream/master' 4b58094 [Bilna P] Update MQTTStreamSuite.scala b1ac4ad [bilna] Added BeforeAndAfter 5f6bfd2 [bilna] Added BeforeAndAfter e8b6623 [Bilna P] Update MQTTStreamSuite.scala 5ca6691 [Bilna P] Update MQTTStreamSuite.scala 8616495 [bilna] [SPARK-4631] unit test for MQTT --- external/mqtt/pom.xml | 6 + .../streaming/mqtt/MQTTStreamSuite.scala | 110 +++++++++++++++--- 2 files changed, 101 insertions(+), 15 deletions(-) diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 9025915f4447e..d478267b605ba 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -66,6 +66,12 @@ junit-interface test + + org.apache.activemq + activemq-core + 5.7.0 + test + target/scala-${scala.binary.version}/classes diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala index 84595acf45ccb..98fe6cb301f52 100644 --- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala @@ -17,31 +17,111 @@ package org.apache.spark.streaming.mqtt -import org.scalatest.FunSuite +import java.net.{URI, ServerSocket} -import org.apache.spark.streaming.{Seconds, StreamingContext} +import org.apache.activemq.broker.{TransportConnector, BrokerService} +import org.apache.spark.util.Utils +import org.scalatest.{BeforeAndAfter, FunSuite} +import org.scalatest.concurrent.Eventually +import scala.concurrent.duration._ +import org.apache.spark.streaming.{Milliseconds, StreamingContext} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.ReceiverInputDStream +import org.eclipse.paho.client.mqttv3._ +import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence -class MQTTStreamSuite 
extends FunSuite { - - val batchDuration = Seconds(1) +class MQTTStreamSuite extends FunSuite with Eventually with BeforeAndAfter { + private val batchDuration = Milliseconds(500) private val master: String = "local[2]" - private val framework: String = this.getClass.getSimpleName + private val freePort = findFreePort() + private val brokerUri = "//localhost:" + freePort + private val topic = "def" + private var ssc: StreamingContext = _ + private val persistenceDir = Utils.createTempDir() + private var broker: BrokerService = _ + private var connector: TransportConnector = _ - test("mqtt input stream") { - val ssc = new StreamingContext(master, framework, batchDuration) - val brokerUrl = "abc" - val topic = "def" + before { + ssc = new StreamingContext(master, framework, batchDuration) + setupMQTT() + } - // tests the API, does not actually test data receiving - val test1: ReceiverInputDStream[String] = MQTTUtils.createStream(ssc, brokerUrl, topic) - val test2: ReceiverInputDStream[String] = - MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_AND_DISK_SER_2) + after { + if (ssc != null) { + ssc.stop() + ssc = null + } + Utils.deleteRecursively(persistenceDir) + tearDownMQTT() + } - // TODO: Actually test receiving data + test("mqtt input stream") { + val sendMessage = "MQTT demo for spark streaming" + val receiveStream: ReceiverInputDStream[String] = + MQTTUtils.createStream(ssc, "tcp:" + brokerUri, topic, StorageLevel.MEMORY_ONLY) + var receiveMessage: List[String] = List() + receiveStream.foreachRDD { rdd => + if (rdd.collect.length > 0) { + receiveMessage = receiveMessage ::: List(rdd.first) + receiveMessage + } + } + ssc.start() + publishData(sendMessage) + eventually(timeout(10000 milliseconds), interval(100 milliseconds)) { + assert(sendMessage.equals(receiveMessage(0))) + } ssc.stop() } + + private def setupMQTT() { + broker = new BrokerService() + connector = new TransportConnector() + connector.setName("mqtt") + connector.setUri(new URI("mqtt:" + brokerUri)) + broker.addConnector(connector) + broker.start() + } + + private def tearDownMQTT() { + if (broker != null) { + broker.stop() + broker = null + } + if (connector != null) { + connector.stop() + connector = null + } + } + + private def findFreePort(): Int = { + Utils.startServiceOnPort(23456, (trialPort: Int) => { + val socket = new ServerSocket(trialPort) + socket.close() + (null, trialPort) + })._2 + } + + def publishData(data: String): Unit = { + var client: MqttClient = null + try { + val persistence: MqttClientPersistence = new MqttDefaultFilePersistence(persistenceDir.getAbsolutePath) + client = new MqttClient("tcp:" + brokerUri, MqttClient.generateClientId(), persistence) + client.connect() + if (client.isConnected) { + val msgTopic: MqttTopic = client.getTopic(topic) + val message: MqttMessage = new MqttMessage(data.getBytes("utf-8")) + message.setQos(1) + message.setRetained(true) + for (i <- 0 to 100) + msgTopic.publish(message) + } + } finally { + client.disconnect() + client.close() + client = null + } + } } From 939ba1f8f6e32fef9026cc43fce55b36e4b9bfd1 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 4 Jan 2015 20:26:18 -0800 Subject: [PATCH 052/116] [SPARK-4835] Disable validateOutputSpecs for Spark Streaming jobs This patch disables output spec. validation for jobs launched through Spark Streaming, since this interferes with checkpoint recovery. 
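For readers unfamiliar with the mechanism the rest of this commit message goes on to describe, a minimal standalone sketch of Scala's `DynamicVariable` scoping pattern follows; the object and value names are invented for the example and are not Spark's.

```
import scala.util.DynamicVariable

// Hypothetical flag holder, not Spark code: a DynamicVariable scopes an
// override to the enclosing withValue block, so code called from inside it
// observes the new value without any global mutable state or config change.
object ValidationFlags {
  val disableOutputSpecValidation = new DynamicVariable[Boolean](false)
}

def isValidationEnabled: Boolean = !ValidationFlags.disableOutputSpecValidation.value

// A scheduler-like caller can bypass validation just for the work it wraps:
ValidationFlags.disableOutputSpecValidation.withValue(true) {
  println(isValidationEnabled) // false inside the scoped block
}
println(isValidationEnabled)   // true again outside it
```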
Hadoop OutputFormats have a `checkOutputSpecs` method which performs certain checks prior to writing output, such as checking whether the output directory already exists. SPARK-1100 added checks for FileOutputFormat, SPARK-1677 (#947) added a SparkConf configuration to disable these checks, and SPARK-2309 (#1088) extended these checks to run for all OutputFormats, not just FileOutputFormat. In Spark Streaming, we might have to re-process a batch during checkpoint recovery, so `save` actions may be called multiple times. In addition to `DStream`'s own save actions, users might use `transform` or `foreachRDD` and call the `RDD` and `PairRDD` save actions. When output spec. validation is enabled, the second calls to these actions will fail due to existing output. This patch automatically disables output spec. validation for jobs submitted by the Spark Streaming scheduler. This is done by using Scala's `DynamicVariable` to propagate the bypass setting without having to mutate SparkConf or introduce a global variable. Author: Josh Rosen Closes #3832 from JoshRosen/SPARK-4835 and squashes the following commits: 36eaf35 [Josh Rosen] Add comment explaining use of transform() in test. 6485cf8 [Josh Rosen] Add test case in Streaming; fix bug for transform() 7b3e06a [Josh Rosen] Remove Streaming-specific setting to undo this change; update conf. guide bf9094d [Josh Rosen] Revise disableOutputSpecValidation() comment to not refer to Spark Streaming. e581d17 [Josh Rosen] Deduplicate isOutputSpecValidationEnabled logic. 762e473 [Josh Rosen] [SPARK-4835] Disable validateOutputSpecs for Spark Streaming jobs. --- .../apache/spark/rdd/PairRDDFunctions.scala | 19 ++++++++- docs/configuration.md | 4 +- .../spark/streaming/dstream/DStream.scala | 10 ++++- .../dstream/TransformedDStream.scala | 2 +- .../streaming/scheduler/JobScheduler.scala | 8 +++- .../spark/streaming/CheckpointSuite.scala | 39 +++++++++++++++++++ 6 files changed, 75 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 4469c89e6bb1c..f8df5b2a08866 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -25,6 +25,7 @@ import scala.collection.{Map, mutable} import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag +import scala.util.DynamicVariable import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus import org.apache.hadoop.conf.{Configurable, Configuration} @@ -964,7 +965,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val outfmt = job.getOutputFormatClass val jobFormat = outfmt.newInstance - if (self.conf.getBoolean("spark.hadoop.validateOutputSpecs", true)) { + if (isOutputSpecValidationEnabled) { // FileOutputFormat ignores the filesystem parameter jobFormat.checkOutputSpecs(job) } @@ -1042,7 +1043,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) logDebug("Saving as hadoop file of type (" + keyClass.getSimpleName + ", " + valueClass.getSimpleName + ")") - if (self.conf.getBoolean("spark.hadoop.validateOutputSpecs", true)) { + if (isOutputSpecValidationEnabled) { // FileOutputFormat ignores the filesystem parameter val ignoredFs = FileSystem.get(hadoopConf) hadoopConf.getOutputFormat.checkOutputSpecs(ignoredFs, hadoopConf) @@ -1117,8 +1118,22 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) private[spark] def valueClass: Class[_] = 
vt.runtimeClass private[spark] def keyOrdering: Option[Ordering[K]] = Option(ord) + + // Note: this needs to be a function instead of a 'val' so that the disableOutputSpecValidation + // setting can take effect: + private def isOutputSpecValidationEnabled: Boolean = { + val validationDisabled = PairRDDFunctions.disableOutputSpecValidation.value + val enabledInConf = self.conf.getBoolean("spark.hadoop.validateOutputSpecs", true) + enabledInConf && !validationDisabled + } } private[spark] object PairRDDFunctions { val RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES = 256 + + /** + * Allows for the `spark.hadoop.validateOutputSpecs` checks to be disabled on a case-by-case + * basis; see SPARK-4835 for more details. + */ + val disableOutputSpecValidation: DynamicVariable[Boolean] = new DynamicVariable[Boolean](false) } diff --git a/docs/configuration.md b/docs/configuration.md index fa9d311f85068..9bb6499993735 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -709,7 +709,9 @@ Apart from these, the following properties are also available, and may be useful
    + previous versions of Spark. Simply use Hadoop's FileSystem API to delete output directories by hand. + This setting is ignored for jobs generated through Spark Streaming's StreamingContext, since + data may need to be rewritten to pre-existing output directories during checkpoint recovery. diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index 28fc00cf3944f..b874f561c12eb 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -26,7 +26,7 @@ import scala.reflect.ClassTag import scala.util.matching.Regex import org.apache.spark.{Logging, SparkException} -import org.apache.spark.rdd.{BlockRDD, RDD} +import org.apache.spark.rdd.{BlockRDD, PairRDDFunctions, RDD} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.StreamingContext.rddToFileName @@ -292,7 +292,13 @@ abstract class DStream[T: ClassTag] ( // set this DStream's creation site, generate RDDs and then restore the previous call site. val prevCallSite = ssc.sparkContext.getCallSite() ssc.sparkContext.setCallSite(creationSite) - val rddOption = compute(time) + // Disable checks for existing output directories in jobs launched by the streaming + // scheduler, since we may need to write output to an existing directory during checkpoint + // recovery; see SPARK-4835 for more details. We need to have this call here because + // compute() might cause Spark jobs to be launched. + val rddOption = PairRDDFunctions.disableOutputSpecValidation.withValue(true) { + compute(time) + } ssc.sparkContext.setCallSite(prevCallSite) rddOption.foreach { case newRDD => diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala index 7cd4554282ca1..71b61856e23c0 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala @@ -17,7 +17,7 @@ package org.apache.spark.streaming.dstream -import org.apache.spark.rdd.RDD +import org.apache.spark.rdd.{PairRDDFunctions, RDD} import org.apache.spark.streaming.{Duration, Time} import scala.reflect.ClassTag diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index cfa3cd8925c80..0e0f5bd3b9db4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -22,6 +22,7 @@ import scala.collection.JavaConversions._ import java.util.concurrent.{TimeUnit, ConcurrentHashMap, Executors} import akka.actor.{ActorRef, Actor, Props} import org.apache.spark.{SparkException, Logging, SparkEnv} +import org.apache.spark.rdd.PairRDDFunctions import org.apache.spark.streaming._ @@ -168,7 +169,12 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { private class JobHandler(job: Job) extends Runnable { def run() { eventActor ! 
JobStarted(job) - job.run() + // Disable checks for existing output directories in jobs launched by the streaming scheduler, + // since we may need to write output to an existing directory during checkpoint recovery; + // see SPARK-4835 for more details. + PairRDDFunctions.disableOutputSpecValidation.withValue(true) { + job.run() + } eventActor ! JobCompleted(job) } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 72d055eb2ea31..5d232c6ade7a9 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -255,6 +255,45 @@ class CheckpointSuite extends TestSuiteBase { } } + test("recovery with saveAsHadoopFile inside transform operation") { + // Regression test for SPARK-4835. + // + // In that issue, the problem was that `saveAsHadoopFile(s)` would fail when the last batch + // was restarted from a checkpoint since the output directory would already exist. However, + // the other saveAsHadoopFile* tests couldn't catch this because they only tested whether the + // output matched correctly and not whether the post-restart batch had successfully finished + // without throwing any errors. The following test reproduces the same bug with a test that + // actually fails because the error in saveAsHadoopFile causes transform() to fail, which + // prevents the expected output from being written to the output stream. + // + // This is not actually a valid use of transform, but it's being used here so that we can test + // the fix for SPARK-4835 independently of additional test cleanup. + // + // After SPARK-5079 is addressed, should be able to remove this test since a strengthened + // version of the other saveAsHadoopFile* tests would prevent regressions for this issue. + val tempDir = Files.createTempDir() + try { + testCheckpointedOperation( + Seq(Seq("a", "a", "b"), Seq("", ""), Seq(), Seq("a", "a", "b"), Seq("", ""), Seq()), + (s: DStream[String]) => { + s.transform { (rdd, time) => + val output = rdd.map(x => (x, 1)).reduceByKey(_ + _) + output.saveAsHadoopFile( + new File(tempDir, "result-" + time.milliseconds).getAbsolutePath, + classOf[Text], + classOf[IntWritable], + classOf[TextOutputFormat[Text, IntWritable]]) + output + } + }, + Seq(Seq(("a", 2), ("b", 1)), Seq(("", 2)), Seq(), Seq(("a", 2), ("b", 1)), Seq(("", 2)), Seq()), + 3 + ) + } finally { + Utils.deleteRecursively(tempDir) + } + } + // This tests whether the StateDStream's RDD checkpoints works correctly such // that the system can recover from a master failure. This assumes as reliable, // replayable input source - TestInputDStream. From 72396522bcf5303f761956658510672e4feb2845 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Sun, 4 Jan 2015 21:03:17 -0800 Subject: [PATCH 053/116] [SPARK-5067][Core] Use '===' to compare well-defined case class A simple fix would be adding `assert(e1.appId == e2.appId)` for `SparkListenerApplicationStart`. But actually we can use `===` for well-defined case class directly. 
Therefore, instead of fixing this issue, I use `===` to compare those well-defined case classes (all fields have implemented a correct `equals` method, such as primitive types) Author: zsxwing Closes #3886 from zsxwing/SPARK-5067 and squashes the following commits: 0a51711 [zsxwing] Use '===' to compare well-defined case class --- .../apache/spark/util/JsonProtocolSuite.scala | 32 +++---------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 593d6dd8c3794..63c2559c5c5f5 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -280,7 +280,7 @@ class JsonProtocolSuite extends FunSuite { private def testBlockManagerId(id: BlockManagerId) { val newId = JsonProtocol.blockManagerIdFromJson(JsonProtocol.blockManagerIdToJson(id)) - assertEquals(id, newId) + assert(id === newId) } private def testTaskInfo(info: TaskInfo) { @@ -335,22 +335,8 @@ class JsonProtocolSuite extends FunSuite { assertEquals(e1.jobResult, e2.jobResult) case (e1: SparkListenerEnvironmentUpdate, e2: SparkListenerEnvironmentUpdate) => assertEquals(e1.environmentDetails, e2.environmentDetails) - case (e1: SparkListenerBlockManagerAdded, e2: SparkListenerBlockManagerAdded) => - assert(e1.maxMem === e2.maxMem) - assert(e1.time === e2.time) - assertEquals(e1.blockManagerId, e2.blockManagerId) - case (e1: SparkListenerBlockManagerRemoved, e2: SparkListenerBlockManagerRemoved) => - assert(e1.time === e2.time) - assertEquals(e1.blockManagerId, e2.blockManagerId) - case (e1: SparkListenerUnpersistRDD, e2: SparkListenerUnpersistRDD) => - assert(e1.rddId == e2.rddId) - case (e1: SparkListenerApplicationStart, e2: SparkListenerApplicationStart) => - assert(e1.appName == e2.appName) - assert(e1.time == e2.time) - assert(e1.sparkUser == e2.sparkUser) - case (e1: SparkListenerApplicationEnd, e2: SparkListenerApplicationEnd) => - assert(e1.time == e2.time) - case (SparkListenerShutdown, SparkListenerShutdown) => + case (e1, e2) => + assert(e1 === e2) case _ => fail("Events don't match in types!") } } @@ -435,16 +421,6 @@ class JsonProtocolSuite extends FunSuite { assert(metrics1.bytesRead === metrics2.bytesRead) } - private def assertEquals(bm1: BlockManagerId, bm2: BlockManagerId) { - if (bm1 == null || bm2 == null) { - assert(bm1 === bm2) - } else { - assert(bm1.executorId === bm2.executorId) - assert(bm1.host === bm2.host) - assert(bm1.port === bm2.port) - } - } - private def assertEquals(result1: JobResult, result2: JobResult) { (result1, result2) match { case (JobSucceeded, JobSucceeded) => @@ -462,7 +438,7 @@ class JsonProtocolSuite extends FunSuite { assert(r1.shuffleId === r2.shuffleId) assert(r1.mapId === r2.mapId) assert(r1.reduceId === r2.reduceId) - assertEquals(r1.bmAddress, r2.bmAddress) + assert(r1.bmAddress === r2.bmAddress) assert(r1.message === r2.message) case (r1: ExceptionFailure, r2: ExceptionFailure) => assert(r1.className === r2.className) From 6c726a3fbd9cd3aa5f3a1992b2132b25eabb76a0 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Sun, 4 Jan 2015 21:06:04 -0800 Subject: [PATCH 054/116] [SPARK-5069][Core] Fix the race condition of TaskSchedulerImpl.dagScheduler It's not necessary to set `TaskSchedulerImpl.dagScheduler` in preStart. It's safe to set it after `initializeEventProcessActor()`. 
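To illustrate the ordering this change relies on, here is a minimal self-contained sketch (toy classes, not the real Spark scheduler types): the back-reference is wired on the constructing thread immediately after the event processor is created, so no event can be dispatched before the reference is set.

    class TaskSchedulerStub {
      @volatile private var dag: AnyRef = null
      def setDAGScheduler(d: AnyRef): Unit = { dag = d }
      def dagIsSet: Boolean = dag != null
    }

    class DAGSchedulerStub(taskScheduler: TaskSchedulerStub) {
      // Stands in for initializeEventProcessActor(): in the real code this starts
      // the actor that will later deliver events needing the back-reference.
      private def initializeEventProcessor(): Unit = ()

      // Constructor body: create the event processor, then register ourselves,
      // instead of deferring the registration to the actor's preStart() hook.
      initializeEventProcessor()
      taskScheduler.setDAGScheduler(this)
    }

    object OrderingSketch extends App {
      val ts = new TaskSchedulerStub
      new DAGSchedulerStub(ts)
      assert(ts.dagIsSet) // set before any task-started callback could observe it
    }
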
Author: zsxwing Closes #3887 from zsxwing/SPARK-5069 and squashes the following commits: d95894f [zsxwing] Fix the race condition of TaskSchedulerImpl.dagScheduler --- .../scala/org/apache/spark/scheduler/DAGScheduler.scala | 7 +------ .../apache/spark/scheduler/TaskSchedulerImplSuite.scala | 1 - 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index cb8ccfbdbdcbb..259621d263d7c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -138,6 +138,7 @@ class DAGScheduler( } initializeEventProcessActor() + taskScheduler.setDAGScheduler(this) // Called by TaskScheduler to report task's starting. def taskStarted(task: Task[_], taskInfo: TaskInfo) { @@ -1375,12 +1376,6 @@ private[scheduler] class DAGSchedulerActorSupervisor(dagScheduler: DAGScheduler) private[scheduler] class DAGSchedulerEventProcessActor(dagScheduler: DAGScheduler) extends Actor with Logging { - override def preStart() { - // set DAGScheduler for taskScheduler to ensure eventProcessActor is always - // valid when the messages arrive - dagScheduler.taskScheduler.setDAGScheduler(dagScheduler) - } - /** * The main event loop of the DAG scheduler. */ diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 40aaf9dd1f1e9..00812e6018d1f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -305,7 +305,6 @@ class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Loggin override def taskStarted(task: Task[_], taskInfo: TaskInfo) {} override def executorAdded(execId: String, host: String) {} } - taskScheduler.setDAGScheduler(dagScheduler) // Give zero core offers. Should not generate any tasks val zeroCoreWorkerOffers = Seq(new WorkerOffer("executor0", "host0", 0), new WorkerOffer("executor1", "host1", 0)) From 27e7f5a7237d9d64a3b2c8a030ba3e3a9a96b26c Mon Sep 17 00:00:00 2001 From: zsxwing Date: Sun, 4 Jan 2015 21:09:21 -0800 Subject: [PATCH 055/116] [SPARK-5083][Core] Fix a flaky test in TaskResultGetterSuite Because `sparkEnv.blockManager.master.removeBlock` is asynchronous, we need to make sure the block has already been removed before calling `super.enqueueSuccessfulTask`. 
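The same wait-for-asynchronous-completion pattern, reduced to a standalone ScalaTest sketch (a plain ConcurrentHashMap stands in for the block manager; only the `eventually` usage and imports mirror the actual fix, assuming the ScalaTest version the patch itself targets):

    import java.util.concurrent.ConcurrentHashMap

    import scala.concurrent.duration._
    import scala.language.postfixOps

    import org.scalatest.FunSuite
    import org.scalatest.concurrent.Eventually._

    class AsyncRemovalSketch extends FunSuite {
      test("wait until an asynchronous removal is visible before proceeding") {
        val store = new ConcurrentHashMap[String, Array[Byte]]()
        store.put("taskresult_0", Array[Byte](1, 2, 3))

        // Simulate removeBlock completing later on another thread.
        new Thread(new Runnable {
          def run(): Unit = { Thread.sleep(100); store.remove("taskresult_0") }
        }).start()

        // Poll with a timeout instead of assuming the removal already happened.
        eventually(timeout(3 seconds), interval(200 milliseconds)) {
          assert(!store.containsKey("taskresult_0"))
        }
      }
    }
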
Author: zsxwing Closes #3894 from zsxwing/SPARK-5083 and squashes the following commits: d97c03d [zsxwing] Fix a flaky test in TaskResultGetterSuite --- .../scheduler/TaskResultGetterSuite.scala | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala index 3aab5a156ee77..e3a3803e6483a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala @@ -19,7 +19,12 @@ package org.apache.spark.scheduler import java.nio.ByteBuffer -import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite} +import scala.concurrent.duration._ +import scala.language.postfixOps +import scala.util.control.NonFatal + +import org.scalatest.{BeforeAndAfter, FunSuite} +import org.scalatest.concurrent.Eventually._ import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv} import org.apache.spark.storage.TaskResultBlockId @@ -34,6 +39,8 @@ class ResultDeletingTaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedule extends TaskResultGetter(sparkEnv, scheduler) { var removedResult = false + @volatile var removeBlockSuccessfully = false + override def enqueueSuccessfulTask( taskSetManager: TaskSetManager, tid: Long, serializedData: ByteBuffer) { if (!removedResult) { @@ -42,6 +49,15 @@ class ResultDeletingTaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedule serializer.get().deserialize[TaskResult[_]](serializedData) match { case IndirectTaskResult(blockId, size) => sparkEnv.blockManager.master.removeBlock(blockId) + // removeBlock is asynchronous. Need to wait it's removed successfully + try { + eventually(timeout(3 seconds), interval(200 milliseconds)) { + assert(!sparkEnv.blockManager.master.contains(blockId)) + } + removeBlockSuccessfully = true + } catch { + case NonFatal(e) => removeBlockSuccessfully = false + } case directResult: DirectTaskResult[_] => taskSetManager.abort("Internal error: expect only indirect results") } @@ -92,10 +108,12 @@ class TaskResultGetterSuite extends FunSuite with BeforeAndAfter with LocalSpark assert(false, "Expect local cluster to use TaskSchedulerImpl") throw new ClassCastException } - scheduler.taskResultGetter = new ResultDeletingTaskResultGetter(sc.env, scheduler) + val resultGetter = new ResultDeletingTaskResultGetter(sc.env, scheduler) + scheduler.taskResultGetter = resultGetter val akkaFrameSize = sc.env.actorSystem.settings.config.getBytes("akka.remote.netty.tcp.maximum-frame-size").toInt val result = sc.parallelize(Seq(1), 1).map(x => 1.to(akkaFrameSize).toArray).reduce((x, y) => x) + assert(resultGetter.removeBlockSuccessfully) assert(result === 1.to(akkaFrameSize).toArray) // Make sure two tasks were run (one failed one, and a second retried one). From 5c506cecb933b156b2f06a688ee08c4347bf0d47 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Sun, 4 Jan 2015 21:18:33 -0800 Subject: [PATCH 056/116] [SPARK-5074][Core] Fix a non-deterministic test failure Add `assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))` to make sure `sparkListener` receive the message. 
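The general shape of that fix, as a hedged standalone sketch (a toy bus with hypothetical names, not Spark's listener bus): post events, block until the delivery thread has drained them, and only then assert on listener state.

    import java.util.concurrent.LinkedBlockingQueue
    import java.util.concurrent.atomic.AtomicInteger

    class ToyListenerBus {
      private val queue = new LinkedBlockingQueue[String]()
      private val posted = new AtomicInteger(0)
      private val processed = new AtomicInteger(0)

      private val deliveryThread = new Thread(new Runnable {
        def run(): Unit = while (true) { queue.take(); processed.incrementAndGet() }
      })
      deliveryThread.setDaemon(true)
      deliveryThread.start()

      def post(event: String): Unit = { posted.incrementAndGet(); queue.put(event) }

      /** Returns true once every posted event has been handled, false on timeout. */
      def waitUntilEmpty(timeoutMs: Long): Boolean = {
        val deadline = System.currentTimeMillis() + timeoutMs
        while (processed.get() < posted.get() && System.currentTimeMillis() < deadline) {
          Thread.sleep(10)
        }
        processed.get() == posted.get()
      }
    }

    object ListenerBusSketch extends App {
      val bus = new ToyListenerBus
      Seq("stageSubmitted", "stageCompleted").foreach(bus.post)
      assert(bus.waitUntilEmpty(10000)) // the fix inserts exactly this kind of wait
      // Assertions against whatever the listeners recorded are now deterministic.
    }
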
Author: zsxwing Closes #3889 from zsxwing/SPARK-5074 and squashes the following commits: e61c198 [zsxwing] Fix a non-deterministic test failure --- .../scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index d6ec9e129cceb..d30eb10bbe947 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -247,6 +247,7 @@ class DAGSchedulerSuite extends TestKit(ActorSystem("DAGSchedulerSuite")) with F test("[SPARK-3353] parent stage should have lower stage id") { sparkListener.stageByOrderOfExecution.clear() sc.parallelize(1 to 10).map(x => (x, x)).reduceByKey(_ + _, 4).count() + assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)) assert(sparkListener.stageByOrderOfExecution.length === 2) assert(sparkListener.stageByOrderOfExecution(0) < sparkListener.stageByOrderOfExecution(1)) } From d3f07fd23cc26a70f44c52e24445974d4885d58a Mon Sep 17 00:00:00 2001 From: Varun Saxena Date: Mon, 5 Jan 2015 10:32:37 -0800 Subject: [PATCH 057/116] [SPARK-4688] Have a single shared network timeout in Spark [SPARK-4688] Have a single shared network timeout in Spark Author: Varun Saxena Author: varunsaxena Closes #3562 from varunsaxena/SPARK-4688 and squashes the following commits: 6e97f72 [Varun Saxena] [SPARK-4688] Single shared network timeout cd783a2 [Varun Saxena] SPARK-4688 d6f8c29 [Varun Saxena] SCALA-4688 9562b15 [Varun Saxena] SPARK-4688 a75f014 [varunsaxena] SPARK-4688 594226c [varunsaxena] SPARK-4688 --- .../apache/spark/network/nio/ConnectionManager.scala | 3 ++- .../apache/spark/storage/BlockManagerMasterActor.scala | 7 +++++-- .../main/scala/org/apache/spark/util/AkkaUtils.scala | 2 +- docs/configuration.md | 10 ++++++++++ .../org/apache/spark/network/util/TransportConf.java | 4 +++- 5 files changed, 21 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala index 243b71c980864..98455c0968263 100644 --- a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala +++ b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala @@ -81,7 +81,8 @@ private[nio] class ConnectionManager( private val ackTimeoutMonitor = new HashedWheelTimer(Utils.namedThreadFactory("AckTimeoutMonitor")) - private val ackTimeout = conf.getInt("spark.core.connection.ack.wait.timeout", 60) + private val ackTimeout = + conf.getInt("spark.core.connection.ack.wait.timeout", conf.getInt("spark.network.timeout", 100)) // Get the thread counts from the Spark Configuration. 
// diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala index 9cbda41223a8b..9d77cf27882eb 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala @@ -52,8 +52,11 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus private val akkaTimeout = AkkaUtils.askTimeout(conf) - val slaveTimeout = conf.getLong("spark.storage.blockManagerSlaveTimeoutMs", - math.max(conf.getInt("spark.executor.heartbeatInterval", 10000) * 3, 45000)) + val slaveTimeout = { + val defaultMs = math.max(conf.getInt("spark.executor.heartbeatInterval", 10000) * 3, 45000) + val networkTimeout = conf.getInt("spark.network.timeout", defaultMs / 1000) + conf.getLong("spark.storage.blockManagerSlaveTimeoutMs", networkTimeout * 1000) + } val checkTimeoutInterval = conf.getLong("spark.storage.blockManagerTimeoutIntervalMs", 60000) diff --git a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala index 8c2457f56bffe..64e3a5416c6b5 100644 --- a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala @@ -65,7 +65,7 @@ private[spark] object AkkaUtils extends Logging { val akkaThreads = conf.getInt("spark.akka.threads", 4) val akkaBatchSize = conf.getInt("spark.akka.batchSize", 15) - val akkaTimeout = conf.getInt("spark.akka.timeout", 100) + val akkaTimeout = conf.getInt("spark.akka.timeout", conf.getInt("spark.network.timeout", 100)) val akkaFrameSize = maxFrameSizeBytes(conf) val akkaLogLifecycleEvents = conf.getBoolean("spark.akka.logLifecycleEvents", false) val lifecycleEvents = if (akkaLogLifecycleEvents) "on" else "off" diff --git a/docs/configuration.md b/docs/configuration.md index 9bb6499993735..7ada67fc303c6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -818,6 +818,16 @@ Apart from these, the following properties are also available, and may be useful Communication timeout between Spark nodes, in seconds. + + + + + diff --git a/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java b/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java index 7c9adf52af0f0..e34382da22a50 100644 --- a/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java +++ b/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java @@ -37,7 +37,9 @@ public boolean preferDirectBufs() { /** Connect timeout in milliseconds. Default 120 secs. */ public int connectionTimeoutMs() { - return conf.getInt("spark.shuffle.io.connectionTimeout", 120) * 1000; + int timeout = + conf.getInt("spark.shuffle.io.connectionTimeout", conf.getInt("spark.network.timeout", 100)); + return timeout * 1000; } /** Number of concurrent connections between two nodes for fetching data. 
*/ From ce39b34404868de4ca51be06832169187b1aef7d Mon Sep 17 00:00:00 2001 From: WangTao Date: Mon, 5 Jan 2015 11:59:38 -0800 Subject: [PATCH 058/116] [SPARK-5057] Log message in failed askWithReply attempts https://issues.apache.org/jira/browse/SPARK-5057 Author: WangTao Author: WangTaoTheTonic Closes #3875 from WangTaoTheTonic/SPARK-5057 and squashes the following commits: 1503487 [WangTao] use string interpolation 706c8a7 [WangTaoTheTonic] log more messages --- .../scala/org/apache/spark/util/AkkaUtils.scala | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala index 64e3a5416c6b5..8d86fd3e11ad7 100644 --- a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala @@ -89,7 +89,7 @@ private[spark] object AkkaUtils extends Logging { } val requireCookie = if (isAuthOn) "on" else "off" val secureCookie = if (isAuthOn) secretKey else "" - logDebug("In createActorSystem, requireCookie is: " + requireCookie) + logDebug(s"In createActorSystem, requireCookie is: $requireCookie") val akkaConf = ConfigFactory.parseMap(conf.getAkkaConf.toMap[String, String]).withFallback( ConfigFactory.parseString( @@ -140,8 +140,8 @@ private[spark] object AkkaUtils extends Logging { def maxFrameSizeBytes(conf: SparkConf): Int = { val frameSizeInMB = conf.getInt("spark.akka.frameSize", 10) if (frameSizeInMB > AKKA_MAX_FRAME_SIZE_IN_MB) { - throw new IllegalArgumentException("spark.akka.frameSize should not be greater than " - + AKKA_MAX_FRAME_SIZE_IN_MB + "MB") + throw new IllegalArgumentException( + s"spark.akka.frameSize should not be greater than $AKKA_MAX_FRAME_SIZE_IN_MB MB") } frameSizeInMB * 1024 * 1024 } @@ -182,8 +182,8 @@ private[spark] object AkkaUtils extends Logging { timeout: FiniteDuration): T = { // TODO: Consider removing multiple attempts if (actor == null) { - throw new SparkException("Error sending message as actor is null " + - "[message = " + message + "]") + throw new SparkException(s"Error sending message [message = $message]" + + " as actor is null ") } var attempts = 0 var lastException: Exception = null @@ -200,13 +200,13 @@ private[spark] object AkkaUtils extends Logging { case ie: InterruptedException => throw ie case e: Exception => lastException = e - logWarning("Error sending message in " + attempts + " attempts", e) + logWarning(s"Error sending message [message = $message] in $attempts attempts", e) } Thread.sleep(retryInterval) } throw new SparkException( - "Error sending message [message = " + message + "]", lastException) + s"Error sending message [message = $message]", lastException) } def makeDriverRef(name: String, conf: SparkConf, actorSystem: ActorSystem): ActorRef = { From 1c0e7ce056c79e1db96f85b8c56a479b8b043970 Mon Sep 17 00:00:00 2001 From: Jongyoul Lee Date: Mon, 5 Jan 2015 12:05:09 -0800 Subject: [PATCH 059/116] [SPARK-4465] runAsSparkUser doesn't affect TaskRunner in Mesos environme... ...nt at all. - fixed a scope of runAsSparkUser from MesosExecutorDriver.run to MesosExecutorBackend.launchTask - See the Jira Issue for more details. Author: Jongyoul Lee Closes #3741 from jongyoul/SPARK-4465 and squashes the following commits: 46ad71e [Jongyoul Lee] [SPARK-4465] runAsSparkUser doesn't affect TaskRunner in Mesos environment at all. - Removed unused import 3d6631f [Jongyoul Lee] [SPARK-4465] runAsSparkUser doesn't affect TaskRunner in Mesos environment at all. 
- Removed comments and adjusted indentations 2343f13 [Jongyoul Lee] [SPARK-4465] runAsSparkUser doesn't affect TaskRunner in Mesos environment at all. - fixed a scope of runAsSparkUser from MesosExecutorDriver.run to MesosExecutorBackend.launchTask --- .../spark/executor/MesosExecutorBackend.scala | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala index a098d07bd8659..2e23ae0a4f831 100644 --- a/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala @@ -22,7 +22,7 @@ import java.nio.ByteBuffer import scala.collection.JavaConversions._ import org.apache.mesos.protobuf.ByteString -import org.apache.mesos.{Executor => MesosExecutor, ExecutorDriver, MesosExecutorDriver, MesosNativeLibrary} +import org.apache.mesos.{Executor => MesosExecutor, ExecutorDriver, MesosExecutorDriver} import org.apache.mesos.Protos.{TaskStatus => MesosTaskStatus, _} import org.apache.spark.{Logging, TaskState, SparkConf, SparkEnv} @@ -80,7 +80,9 @@ private[spark] class MesosExecutorBackend if (executor == null) { logError("Received launchTask but executor was null") } else { - executor.launchTask(this, taskId, taskInfo.getName, taskInfo.getData.asReadOnlyByteBuffer) + SparkHadoopUtil.get.runAsSparkUser { () => + executor.launchTask(this, taskId, taskInfo.getName, taskInfo.getData.asReadOnlyByteBuffer) + } } } @@ -112,11 +114,8 @@ private[spark] class MesosExecutorBackend private[spark] object MesosExecutorBackend extends Logging { def main(args: Array[String]) { SignalLogger.register(log) - SparkHadoopUtil.get.runAsSparkUser { () => - MesosNativeLibrary.load() - // Create a new Executor and start it running - val runner = new MesosExecutorBackend() - new MesosExecutorDriver(runner).run() - } + // Create a new Executor and start it running + val runner = new MesosExecutorBackend() + new MesosExecutorDriver(runner).run() } } From 6c6f32574023b8e43a24f2081ff17e6e446de2f3 Mon Sep 17 00:00:00 2001 From: freeman Date: Mon, 5 Jan 2015 13:10:59 -0800 Subject: [PATCH 060/116] [SPARK-5089][PYSPARK][MLLIB] Fix vector convert This is a small change addressing a potentially significant bug in how PySpark + MLlib handles non-float64 numpy arrays. The automatic conversion to `DenseVector` that occurs when passing RDDs to MLlib algorithms in PySpark should automatically upcast to float64s, but currently this wasn't actually happening. As a result, non-float64 would be silently parsed inappropriately during SerDe, yielding erroneous results when running, for example, KMeans. The PR includes the fix, as well as a new test for the correct conversion behavior. 
davies Author: freeman Closes #3902 from freeman-lab/fix-vector-convert and squashes the following commits: 764db47 [freeman] Add a test for proper conversion behavior 704f97e [freeman] Return array after changing type --- python/pyspark/mllib/linalg.py | 2 +- python/pyspark/mllib/tests.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index f7aa2b0cb04b3..4f8491f43e457 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -178,7 +178,7 @@ def __init__(self, ar): elif not isinstance(ar, np.ndarray): ar = np.array(ar, dtype=np.float64) if ar.dtype != np.float64: - ar.astype(np.float64) + ar = ar.astype(np.float64) self.array = ar def __reduce__(self): diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 5034f229e824a..1f48bc1219dba 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -110,6 +110,16 @@ def test_squared_distance(self): self.assertEquals(0.0, _squared_distance(dv, dv)) self.assertEquals(0.0, _squared_distance(lst, lst)) + def test_conversion(self): + # numpy arrays should be automatically upcast to float64 + # tests for fix of [SPARK-5089] + v = array([1, 2, 3, 4], dtype='float64') + dv = DenseVector(v) + self.assertTrue(dv.array.dtype == 'float64') + v = array([1, 2, 3, 4], dtype='float32') + dv = DenseVector(v) + self.assertTrue(dv.array.dtype == 'float64') + class ListTests(PySparkTestCase): From bbcba3a9430365640c0188e7ca6e0677d3227dd8 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Mon, 5 Jan 2015 15:19:53 -0800 Subject: [PATCH 061/116] [SPARK-5093] Set spark.network.timeout to 120s consistently. Author: Reynold Xin Closes #3903 from rxin/timeout-120 and squashes the following commits: 7c2138e [Reynold Xin] [SPARK-5093] Set spark.network.timeout to 120s consistently. --- .../org/apache/spark/network/nio/ConnectionManager.scala | 2 +- .../org/apache/spark/storage/BlockManagerMasterActor.scala | 6 +----- core/src/main/scala/org/apache/spark/util/AkkaUtils.scala | 2 +- docs/configuration.md | 6 +++--- .../java/org/apache/spark/network/util/TransportConf.java | 5 ++--- 5 files changed, 8 insertions(+), 13 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala index 98455c0968263..3340fca08014e 100644 --- a/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala +++ b/core/src/main/scala/org/apache/spark/network/nio/ConnectionManager.scala @@ -82,7 +82,7 @@ private[nio] class ConnectionManager( new HashedWheelTimer(Utils.namedThreadFactory("AckTimeoutMonitor")) private val ackTimeout = - conf.getInt("spark.core.connection.ack.wait.timeout", conf.getInt("spark.network.timeout", 100)) + conf.getInt("spark.core.connection.ack.wait.timeout", conf.getInt("spark.network.timeout", 120)) // Get the thread counts from the Spark Configuration. 
// diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala index 9d77cf27882eb..64133464d8daa 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala @@ -52,11 +52,7 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus private val akkaTimeout = AkkaUtils.askTimeout(conf) - val slaveTimeout = { - val defaultMs = math.max(conf.getInt("spark.executor.heartbeatInterval", 10000) * 3, 45000) - val networkTimeout = conf.getInt("spark.network.timeout", defaultMs / 1000) - conf.getLong("spark.storage.blockManagerSlaveTimeoutMs", networkTimeout * 1000) - } + val slaveTimeout = conf.getLong("spark.storage.blockManagerSlaveTimeoutMs", 120 * 1000) val checkTimeoutInterval = conf.getLong("spark.storage.blockManagerTimeoutIntervalMs", 60000) diff --git a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala index 8d86fd3e11ad7..db2531dc171f8 100644 --- a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala @@ -65,7 +65,7 @@ private[spark] object AkkaUtils extends Logging { val akkaThreads = conf.getInt("spark.akka.threads", 4) val akkaBatchSize = conf.getInt("spark.akka.batchSize", 15) - val akkaTimeout = conf.getInt("spark.akka.timeout", conf.getInt("spark.network.timeout", 100)) + val akkaTimeout = conf.getInt("spark.akka.timeout", conf.getInt("spark.network.timeout", 120)) val akkaFrameSize = maxFrameSizeBytes(conf) val akkaLogLifecycleEvents = conf.getBoolean("spark.akka.logLifecycleEvents", false) val lifecycleEvents = if (akkaLogLifecycleEvents) "on" else "off" diff --git a/docs/configuration.md b/docs/configuration.md index 7ada67fc303c6..2add48569bece 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -820,12 +820,12 @@ Apart from these, the following properties are also available, and may be useful - + diff --git a/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java b/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java index e34382da22a50..6c9178688693f 100644 --- a/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java +++ b/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java @@ -37,9 +37,8 @@ public boolean preferDirectBufs() { /** Connect timeout in milliseconds. Default 120 secs. */ public int connectionTimeoutMs() { - int timeout = - conf.getInt("spark.shuffle.io.connectionTimeout", conf.getInt("spark.network.timeout", 100)); - return timeout * 1000; + int defaultTimeout = conf.getInt("spark.network.timeout", 120); + return conf.getInt("spark.shuffle.io.connectionTimeout", defaultTimeout) * 1000; } /** Number of concurrent connections between two nodes for fetching data. */ From 04d55d8e8e4890d110ce5561b5c1ae608c34a7c9 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Mon, 5 Jan 2015 15:34:22 -0800 Subject: [PATCH 062/116] [SPARK-5040][SQL] Support expressing unresolved attributes using $"attribute name" notation in SQL DSL. 
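The mechanism, shown as a minimal self-contained sketch (simplified names rather than the actual Catalyst classes): an implicit class over StringContext adds a `$` interpolator that builds an unresolved-attribute value, so $"col name" works alongside the existing 'symbol syntax.

    case class UnresolvedAttr(name: String)

    object DollarDslSketch {
      // Wrapping StringContext puts a `$` interpolator on every string literal in scope.
      implicit class StringToAttributeConversion(val sc: StringContext) {
        def $(args: Any*): UnresolvedAttr = UnresolvedAttr(sc.s(args: _*))
      }

      def main(args: Array[String]): Unit = {
        val col = "value"
        assert($"key" == UnresolvedAttr("key"))    // literal column name
        assert($"$col" == UnresolvedAttr("value")) // interpolation still applies
      }
    }
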
Author: Reynold Xin Closes #3862 from rxin/stringcontext-attr and squashes the following commits: 9b10f57 [Reynold Xin] Rename StrongToAttributeConversionHelper 72121af [Reynold Xin] [SPARK-5040][SQL] Support expressing unresolved attributes using $"attribute name" notation in SQL DSL. --- .../org/apache/spark/sql/catalyst/dsl/package.scala | 9 +++++++++ .../scala/org/apache/spark/sql/DslQuerySuite.scala | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 8e39f79d2ca51..9608e15c0f302 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -135,6 +135,15 @@ package object dsl { implicit def symbolToUnresolvedAttribute(s: Symbol): analysis.UnresolvedAttribute = analysis.UnresolvedAttribute(s.name) + /** Converts $"col name" into an [[analysis.UnresolvedAttribute]]. */ + implicit class StringToAttributeConversionHelper(val sc: StringContext) { + // Note that if we make ExpressionConversions an object rather than a trait, we can + // then make this a value class to avoid the small penalty of runtime instantiation. + def $(args: Any*): analysis.UnresolvedAttribute = { + analysis.UnresolvedAttribute(sc.s(args :_*)) + } + } + def sum(e: Expression) = Sum(e) def sumDistinct(e: Expression) = SumDistinct(e) def count(e: Expression) = Count(e) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala index c0b9cf5163120..ab88f3ad10d66 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala @@ -56,6 +56,18 @@ class DslQuerySuite extends QueryTest { ) } + test("convert $\"attribute name\" into unresolved attribute") { + checkAnswer( + testData.where($"key" === 1).select($"value"), + Seq(Seq("1"))) + } + + test("convert Scala Symbol 'attrname into unresolved attribute") { + checkAnswer( + testData.where('key === 1).select('value), + Seq(Seq("1"))) + } + test("select *") { checkAnswer( testData.select(Star(None)), From 451546aa6d2e61e43b0c0f0669f18cfb7489e584 Mon Sep 17 00:00:00 2001 From: Kostas Sakellis Date: Mon, 5 Jan 2015 23:26:33 -0800 Subject: [PATCH 063/116] SPARK-4843 [YARN] Squash ExecutorRunnableUtil and ExecutorRunnable ExecutorRunnableUtil is a parent of ExecutorRunnable because of the yarn-alpha and yarn-stable split. Now that yarn-alpha is gone, this commit squashes the unnecessary hierarchy. The methods from ExecutorRunnableUtil are added as private. 
Author: Kostas Sakellis Closes #3696 from ksakellis/kostas-spark-4843 and squashes the following commits: 486716f [Kostas Sakellis] Moved prepareEnvironment call to after yarnConf declaration 470e22e [Kostas Sakellis] Fixed indentation and renamed sparkConf variable 9b1b1c9 [Kostas Sakellis] SPARK-4843 [YARN] Squash ExecutorRunnableUtil and ExecutorRunnable --- .../spark/deploy/yarn/ExecutorRunnable.scala | 182 +++++++++++++++- .../deploy/yarn/ExecutorRunnableUtil.scala | 203 ------------------ 2 files changed, 172 insertions(+), 213 deletions(-) delete mode 100644 yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index fdd3c2300fa78..6d9198c122e97 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -17,32 +17,33 @@ package org.apache.spark.deploy.yarn +import java.net.URI import java.nio.ByteBuffer -import java.security.PrivilegedExceptionAction + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.yarn.api.ApplicationConstants.Environment +import org.apache.spark.util.Utils import scala.collection.JavaConversions._ +import scala.collection.mutable.{HashMap, ListBuffer} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.DataOutputBuffer -import org.apache.hadoop.net.NetUtils import org.apache.hadoop.security.UserGroupInformation import org.apache.hadoop.yarn.api._ import org.apache.hadoop.yarn.api.records._ -import org.apache.hadoop.yarn.api.records.impl.pb.ProtoUtils -import org.apache.hadoop.yarn.api.protocolrecords._ import org.apache.hadoop.yarn.client.api.NMClient import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.ipc.YarnRPC -import org.apache.hadoop.yarn.util.{Apps, ConverterUtils, Records} +import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import org.apache.spark.{SecurityManager, SparkConf, Logging} import org.apache.spark.network.util.JavaUtils - class ExecutorRunnable( container: Container, conf: Configuration, - spConf: SparkConf, + sparkConf: SparkConf, masterAddress: String, slaveId: String, hostname: String, @@ -50,13 +51,13 @@ class ExecutorRunnable( executorCores: Int, appId: String, securityMgr: SecurityManager) - extends Runnable with ExecutorRunnableUtil with Logging { + extends Runnable with Logging { var rpc: YarnRPC = YarnRPC.create(conf) var nmClient: NMClient = _ - val sparkConf = spConf val yarnConf: YarnConfiguration = new YarnConfiguration(conf) - + lazy val env = prepareEnvironment + def run = { logInfo("Starting Executor Container") nmClient = NMClient.createNMClient() @@ -110,4 +111,165 @@ class ExecutorRunnable( nmClient.startContainer(container, ctx) } + private def prepareCommand( + masterAddress: String, + slaveId: String, + hostname: String, + executorMemory: Int, + executorCores: Int, + appId: String, + localResources: HashMap[String, LocalResource]): List[String] = { + // Extra options for the JVM + val javaOpts = ListBuffer[String]() + + // Set the environment variable through a command prefix + // to append to the existing value of the variable + var prefixEnv: Option[String] = None + + // Set the JVM memory + val executorMemoryString = executorMemory + "m" + javaOpts += "-Xms" + executorMemoryString + " -Xmx" + executorMemoryString + " " + + // Set extra Java options for the executor, 
if defined + sys.props.get("spark.executor.extraJavaOptions").foreach { opts => + javaOpts += opts + } + sys.env.get("SPARK_JAVA_OPTS").foreach { opts => + javaOpts += opts + } + sys.props.get("spark.executor.extraLibraryPath").foreach { p => + prefixEnv = Some(Utils.libraryPathEnvPrefix(Seq(p))) + } + + javaOpts += "-Djava.io.tmpdir=" + + new Path(Environment.PWD.$(), YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR) + + // Certain configs need to be passed here because they are needed before the Executor + // registers with the Scheduler and transfers the spark configs. Since the Executor backend + // uses Akka to connect to the scheduler, the akka settings are needed as well as the + // authentication settings. + sparkConf.getAll. + filter { case (k, v) => k.startsWith("spark.auth") || k.startsWith("spark.akka") }. + foreach { case (k, v) => javaOpts += YarnSparkHadoopUtil.escapeForShell(s"-D$k=$v") } + + sparkConf.getAkkaConf. + foreach { case (k, v) => javaOpts += YarnSparkHadoopUtil.escapeForShell(s"-D$k=$v") } + + // Commenting it out for now - so that people can refer to the properties if required. Remove + // it once cpuset version is pushed out. + // The context is, default gc for server class machines end up using all cores to do gc - hence + // if there are multiple containers in same node, spark gc effects all other containers + // performance (which can also be other spark containers) + // Instead of using this, rely on cpusets by YARN to enforce spark behaves 'properly' in + // multi-tenant environments. Not sure how default java gc behaves if it is limited to subset + // of cores on a node. + /* + else { + // If no java_opts specified, default to using -XX:+CMSIncrementalMode + // It might be possible that other modes/config is being done in + // spark.executor.extraJavaOptions, so we dont want to mess with it. + // In our expts, using (default) throughput collector has severe perf ramnifications in + // multi-tennent machines + // The options are based on + // http://www.oracle.com/technetwork/java/gc-tuning-5-138395.html#0.0.0.%20When%20to%20Use + // %20the%20Concurrent%20Low%20Pause%20Collector|outline + javaOpts += " -XX:+UseConcMarkSweepGC " + javaOpts += " -XX:+CMSIncrementalMode " + javaOpts += " -XX:+CMSIncrementalPacing " + javaOpts += " -XX:CMSIncrementalDutyCycleMin=0 " + javaOpts += " -XX:CMSIncrementalDutyCycle=10 " + } + */ + + // For log4j configuration to reference + javaOpts += ("-Dspark.yarn.app.container.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR) + + val commands = prefixEnv ++ Seq(Environment.JAVA_HOME.$() + "/bin/java", + "-server", + // Kill if OOM is raised - leverage yarn's failure handling to cause rescheduling. + // Not killing the task leaves various aspects of the executor and (to some extent) the jvm in + // an inconsistent state. + // TODO: If the OOM is not recoverable by rescheduling it on different node, then do + // 'something' to fail job ... akin to blacklisting trackers in mapred ? 
+ "-XX:OnOutOfMemoryError='kill %p'") ++ + javaOpts ++ + Seq("org.apache.spark.executor.CoarseGrainedExecutorBackend", + masterAddress.toString, + slaveId.toString, + hostname.toString, + executorCores.toString, + appId, + "1>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout", + "2>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr") + + // TODO: it would be nicer to just make sure there are no null commands here + commands.map(s => if (s == null) "null" else s).toList + } + + private def setupDistributedCache( + file: String, + rtype: LocalResourceType, + localResources: HashMap[String, LocalResource], + timestamp: String, + size: String, + vis: String): Unit = { + val uri = new URI(file) + val amJarRsrc = Records.newRecord(classOf[LocalResource]) + amJarRsrc.setType(rtype) + amJarRsrc.setVisibility(LocalResourceVisibility.valueOf(vis)) + amJarRsrc.setResource(ConverterUtils.getYarnUrlFromURI(uri)) + amJarRsrc.setTimestamp(timestamp.toLong) + amJarRsrc.setSize(size.toLong) + localResources(uri.getFragment()) = amJarRsrc + } + + private def prepareLocalResources: HashMap[String, LocalResource] = { + logInfo("Preparing Local resources") + val localResources = HashMap[String, LocalResource]() + + if (System.getenv("SPARK_YARN_CACHE_FILES") != null) { + val timeStamps = System.getenv("SPARK_YARN_CACHE_FILES_TIME_STAMPS").split(',') + val fileSizes = System.getenv("SPARK_YARN_CACHE_FILES_FILE_SIZES").split(',') + val distFiles = System.getenv("SPARK_YARN_CACHE_FILES").split(',') + val visibilities = System.getenv("SPARK_YARN_CACHE_FILES_VISIBILITIES").split(',') + for( i <- 0 to distFiles.length - 1) { + setupDistributedCache(distFiles(i), LocalResourceType.FILE, localResources, timeStamps(i), + fileSizes(i), visibilities(i)) + } + } + + if (System.getenv("SPARK_YARN_CACHE_ARCHIVES") != null) { + val timeStamps = System.getenv("SPARK_YARN_CACHE_ARCHIVES_TIME_STAMPS").split(',') + val fileSizes = System.getenv("SPARK_YARN_CACHE_ARCHIVES_FILE_SIZES").split(',') + val distArchives = System.getenv("SPARK_YARN_CACHE_ARCHIVES").split(',') + val visibilities = System.getenv("SPARK_YARN_CACHE_ARCHIVES_VISIBILITIES").split(',') + for( i <- 0 to distArchives.length - 1) { + setupDistributedCache(distArchives(i), LocalResourceType.ARCHIVE, localResources, + timeStamps(i), fileSizes(i), visibilities(i)) + } + } + + logInfo("Prepared Local resources " + localResources) + localResources + } + + private def prepareEnvironment: HashMap[String, String] = { + val env = new HashMap[String, String]() + val extraCp = sparkConf.getOption("spark.executor.extraClassPath") + ClientBase.populateClasspath(null, yarnConf, sparkConf, env, extraCp) + + sparkConf.getExecutorEnv.foreach { case (key, value) => + // This assumes each executor environment variable set here is a path + // This is kept for backward compatibility and consistency with hadoop + YarnSparkHadoopUtil.addPathToEnvironment(env, key, value) + } + + // Keep this for backwards compatibility but users should move to the config + sys.env.get("SPARK_YARN_USER_ENV").foreach { userEnvs => + YarnSparkHadoopUtil.setEnvFromInputString(env, userEnvs) + } + + System.getenv().filterKeys(_.startsWith("SPARK")).foreach { case (k, v) => env(k) = v } + env + } } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala deleted file mode 100644 index 22d73ecf6d010..0000000000000 --- 
a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.deploy.yarn - -import java.net.URI - -import scala.collection.JavaConversions._ -import scala.collection.mutable.{HashMap, ListBuffer} - -import org.apache.hadoop.fs.Path -import org.apache.hadoop.yarn.api._ -import org.apache.hadoop.yarn.api.ApplicationConstants.Environment -import org.apache.hadoop.yarn.api.records._ -import org.apache.hadoop.yarn.conf.YarnConfiguration -import org.apache.hadoop.yarn.util.{ConverterUtils, Records} - -import org.apache.spark.{Logging, SparkConf} -import org.apache.spark.util.Utils - -trait ExecutorRunnableUtil extends Logging { - - val yarnConf: YarnConfiguration - val sparkConf: SparkConf - lazy val env = prepareEnvironment - - def prepareCommand( - masterAddress: String, - slaveId: String, - hostname: String, - executorMemory: Int, - executorCores: Int, - appId: String, - localResources: HashMap[String, LocalResource]): List[String] = { - // Extra options for the JVM - val javaOpts = ListBuffer[String]() - - // Set the environment variable through a command prefix - // to append to the existing value of the variable - var prefixEnv: Option[String] = None - - // Set the JVM memory - val executorMemoryString = executorMemory + "m" - javaOpts += "-Xms" + executorMemoryString + " -Xmx" + executorMemoryString + " " - - // Set extra Java options for the executor, if defined - sys.props.get("spark.executor.extraJavaOptions").foreach { opts => - javaOpts += opts - } - sys.env.get("SPARK_JAVA_OPTS").foreach { opts => - javaOpts += opts - } - sys.props.get("spark.executor.extraLibraryPath").foreach { p => - prefixEnv = Some(Utils.libraryPathEnvPrefix(Seq(p))) - } - - javaOpts += "-Djava.io.tmpdir=" + - new Path(Environment.PWD.$(), YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR) - - // Certain configs need to be passed here because they are needed before the Executor - // registers with the Scheduler and transfers the spark configs. Since the Executor backend - // uses Akka to connect to the scheduler, the akka settings are needed as well as the - // authentication settings. - sparkConf.getAll. - filter { case (k, v) => k.startsWith("spark.auth") || k.startsWith("spark.akka") }. - foreach { case (k, v) => javaOpts += YarnSparkHadoopUtil.escapeForShell(s"-D$k=$v") } - - sparkConf.getAkkaConf. - foreach { case (k, v) => javaOpts += YarnSparkHadoopUtil.escapeForShell(s"-D$k=$v") } - - // Commenting it out for now - so that people can refer to the properties if required. Remove - // it once cpuset version is pushed out. 
- // The context is, default gc for server class machines end up using all cores to do gc - hence - // if there are multiple containers in same node, spark gc effects all other containers - // performance (which can also be other spark containers) - // Instead of using this, rely on cpusets by YARN to enforce spark behaves 'properly' in - // multi-tenant environments. Not sure how default java gc behaves if it is limited to subset - // of cores on a node. - /* - else { - // If no java_opts specified, default to using -XX:+CMSIncrementalMode - // It might be possible that other modes/config is being done in - // spark.executor.extraJavaOptions, so we dont want to mess with it. - // In our expts, using (default) throughput collector has severe perf ramnifications in - // multi-tennent machines - // The options are based on - // http://www.oracle.com/technetwork/java/gc-tuning-5-138395.html#0.0.0.%20When%20to%20Use - // %20the%20Concurrent%20Low%20Pause%20Collector|outline - javaOpts += " -XX:+UseConcMarkSweepGC " - javaOpts += " -XX:+CMSIncrementalMode " - javaOpts += " -XX:+CMSIncrementalPacing " - javaOpts += " -XX:CMSIncrementalDutyCycleMin=0 " - javaOpts += " -XX:CMSIncrementalDutyCycle=10 " - } - */ - - // For log4j configuration to reference - javaOpts += ("-Dspark.yarn.app.container.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR) - - val commands = prefixEnv ++ Seq(Environment.JAVA_HOME.$() + "/bin/java", - "-server", - // Kill if OOM is raised - leverage yarn's failure handling to cause rescheduling. - // Not killing the task leaves various aspects of the executor and (to some extent) the jvm in - // an inconsistent state. - // TODO: If the OOM is not recoverable by rescheduling it on different node, then do - // 'something' to fail job ... akin to blacklisting trackers in mapred ? 
- "-XX:OnOutOfMemoryError='kill %p'") ++ - javaOpts ++ - Seq("org.apache.spark.executor.CoarseGrainedExecutorBackend", - masterAddress.toString, - slaveId.toString, - hostname.toString, - executorCores.toString, - appId, - "1>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout", - "2>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr") - - // TODO: it would be nicer to just make sure there are no null commands here - commands.map(s => if (s == null) "null" else s).toList - } - - private def setupDistributedCache( - file: String, - rtype: LocalResourceType, - localResources: HashMap[String, LocalResource], - timestamp: String, - size: String, - vis: String): Unit = { - val uri = new URI(file) - val amJarRsrc = Records.newRecord(classOf[LocalResource]) - amJarRsrc.setType(rtype) - amJarRsrc.setVisibility(LocalResourceVisibility.valueOf(vis)) - amJarRsrc.setResource(ConverterUtils.getYarnUrlFromURI(uri)) - amJarRsrc.setTimestamp(timestamp.toLong) - amJarRsrc.setSize(size.toLong) - localResources(uri.getFragment()) = amJarRsrc - } - - def prepareLocalResources: HashMap[String, LocalResource] = { - logInfo("Preparing Local resources") - val localResources = HashMap[String, LocalResource]() - - if (System.getenv("SPARK_YARN_CACHE_FILES") != null) { - val timeStamps = System.getenv("SPARK_YARN_CACHE_FILES_TIME_STAMPS").split(',') - val fileSizes = System.getenv("SPARK_YARN_CACHE_FILES_FILE_SIZES").split(',') - val distFiles = System.getenv("SPARK_YARN_CACHE_FILES").split(',') - val visibilities = System.getenv("SPARK_YARN_CACHE_FILES_VISIBILITIES").split(',') - for( i <- 0 to distFiles.length - 1) { - setupDistributedCache(distFiles(i), LocalResourceType.FILE, localResources, timeStamps(i), - fileSizes(i), visibilities(i)) - } - } - - if (System.getenv("SPARK_YARN_CACHE_ARCHIVES") != null) { - val timeStamps = System.getenv("SPARK_YARN_CACHE_ARCHIVES_TIME_STAMPS").split(',') - val fileSizes = System.getenv("SPARK_YARN_CACHE_ARCHIVES_FILE_SIZES").split(',') - val distArchives = System.getenv("SPARK_YARN_CACHE_ARCHIVES").split(',') - val visibilities = System.getenv("SPARK_YARN_CACHE_ARCHIVES_VISIBILITIES").split(',') - for( i <- 0 to distArchives.length - 1) { - setupDistributedCache(distArchives(i), LocalResourceType.ARCHIVE, localResources, - timeStamps(i), fileSizes(i), visibilities(i)) - } - } - - logInfo("Prepared Local resources " + localResources) - localResources - } - - def prepareEnvironment: HashMap[String, String] = { - val env = new HashMap[String, String]() - val extraCp = sparkConf.getOption("spark.executor.extraClassPath") - ClientBase.populateClasspath(null, yarnConf, sparkConf, env, extraCp) - - sparkConf.getExecutorEnv.foreach { case (key, value) => - // This assumes each executor environment variable set here is a path - // This is kept for backward compatibility and consistency with hadoop - YarnSparkHadoopUtil.addPathToEnvironment(env, key, value) - } - - // Keep this for backwards compatibility but users should move to the config - sys.env.get("SPARK_YARN_USER_ENV").foreach { userEnvs => - YarnSparkHadoopUtil.setEnvFromInputString(env, userEnvs) - } - - System.getenv().filterKeys(_.startsWith("SPARK")).foreach { case (k, v) => env(k) = v } - env - } - -} From a6394bc2c094c6c662237236c2effa2dabe67910 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 6 Jan 2015 00:31:19 -0800 Subject: [PATCH 064/116] [SPARK-1600] Refactor FileInputStream tests to remove Thread.sleep() calls and SystemClock usage This patch refactors Spark Streaming's FileInputStream tests to 
remove uses of Thread.sleep() and SystemClock, which should hopefully resolve some longstanding flakiness in these tests (see SPARK-1600). Key changes: - Modify FileInputDStream to use the scheduler's Clock instead of System.currentTimeMillis(); this allows it to be tested using ManualClock. - Fix a synchronization issue in ManualClock's `currentTime` method. - Add a StreamingTestWaiter class which allows callers to block until a certain number of batches have finished. - Change the FileInputStream tests so that files' modification times are manually set based off of ManualClock; this eliminates many Thread.sleep calls. - Update these tests to use the withStreamingContext fixture. Author: Josh Rosen Closes #3801 from JoshRosen/SPARK-1600 and squashes the following commits: e4494f4 [Josh Rosen] Address a potential race when setting file modification times 8340bd0 [Josh Rosen] Use set comparisons for output. 0b9c252 [Josh Rosen] Fix some ManualClock usage problems. 1cc689f [Josh Rosen] ConcurrentHashMap -> SynchronizedMap db26c3a [Josh Rosen] Use standard timeout in ScalaTest `eventually` blocks. 3939432 [Josh Rosen] Rename StreamingTestWaiter to BatchCounter 0b9c3a1 [Josh Rosen] Wait for checkpoint to complete 863d71a [Josh Rosen] Remove Thread.sleep that was used to make task run slowly b4442c3 [Josh Rosen] batchTimeToSelectedFiles should be thread-safe 15b48ee [Josh Rosen] Replace several TestWaiter methods w/ ScalaTest eventually. fffc51c [Josh Rosen] Revert "Remove last remaining sleep() call" dbb8247 [Josh Rosen] Remove last remaining sleep() call 566a63f [Josh Rosen] Fix log message and comment typos da32f3f [Josh Rosen] Fix log message and comment typos 3689214 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-1600 c8f06b1 [Josh Rosen] Remove Thread.sleep calls in FileInputStream CheckpointSuite test. d4f2d87 [Josh Rosen] Refactor file input stream tests to not rely on SystemClock. dda1403 [Josh Rosen] Add StreamingTestWaiter class. 3c3efc3 [Josh Rosen] Synchronize `currentTime` in ManualClock a95ddc4 [Josh Rosen] Modify FileInputDStream to use Clock class. 
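For reference, a small sketch of the manual-clock idea these changes lean on (a toy version, not the real org.apache.spark.streaming.util classes): the stream asks a Clock for the current time, and the test-only implementation synchronizes both reads and writes so time can be advanced deterministically without Thread.sleep().

    trait Clock {
      def currentTime(): Long
    }

    class SystemClock extends Clock {
      def currentTime(): Long = System.currentTimeMillis()
    }

    class ManualClock extends Clock {
      private var time = 0L

      // Reads synchronize as well as writes -- the "fix a synchronization issue in
      // ManualClock's currentTime method" bullet above.
      def currentTime(): Long = this.synchronized { time }

      def setTime(t: Long): Unit = this.synchronized { time = t; notifyAll() }

      def addToTime(delta: Long): Unit = this.synchronized { time += delta; notifyAll() }
    }

    object ManualClockSketch extends App {
      val clock = new ManualClock
      clock.addToTime(1000)
      // A test can stamp files' modification times with clock.currentTime()
      // and advance the clock to trigger the next batch, with no sleeping.
      assert(clock.currentTime() == 1000)
    }
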
--- .../streaming/dstream/FileInputDStream.scala | 16 +- .../apache/spark/streaming/util/Clock.scala | 6 +- .../streaming/BasicOperationsSuite.scala | 2 +- .../spark/streaming/CheckpointSuite.scala | 248 +++++++++++------- .../spark/streaming/InputStreamsSuite.scala | 69 +++-- .../spark/streaming/TestSuiteBase.scala | 46 +++- 6 files changed, 251 insertions(+), 136 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala index 5f13fdc5579ed..e7c5639a63499 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala @@ -18,6 +18,7 @@ package org.apache.spark.streaming.dstream import java.io.{IOException, ObjectInputStream} +import java.util.concurrent.ConcurrentHashMap import scala.collection.mutable import scala.reflect.ClassTag @@ -74,12 +75,15 @@ class FileInputDStream[K: ClassTag, V: ClassTag, F <: NewInputFormat[K,V] : Clas newFilesOnly: Boolean = true) extends InputDStream[(K, V)](ssc_) { + // This is a def so that it works during checkpoint recovery: + private def clock = ssc.scheduler.clock + // Data to be saved as part of the streaming checkpoints protected[streaming] override val checkpointData = new FileInputDStreamCheckpointData // Initial ignore threshold based on which old, existing files in the directory (at the time of // starting the streaming application) will be ignored or considered - private val initialModTimeIgnoreThreshold = if (newFilesOnly) System.currentTimeMillis() else 0L + private val initialModTimeIgnoreThreshold = if (newFilesOnly) clock.currentTime() else 0L /* * Make sure that the information of files selected in the last few batches are remembered. 
@@ -91,8 +95,9 @@ class FileInputDStream[K: ClassTag, V: ClassTag, F <: NewInputFormat[K,V] : Clas remember(durationToRemember) // Map of batch-time to selected file info for the remembered batches + // This is a concurrent map because it's also accessed in unit tests @transient private[streaming] var batchTimeToSelectedFiles = - new mutable.HashMap[Time, Array[String]] + new mutable.HashMap[Time, Array[String]] with mutable.SynchronizedMap[Time, Array[String]] // Set of files that were selected in the remembered batches @transient private var recentlySelectedFiles = new mutable.HashSet[String]() @@ -151,7 +156,7 @@ class FileInputDStream[K: ClassTag, V: ClassTag, F <: NewInputFormat[K,V] : Clas */ private def findNewFiles(currentTime: Long): Array[String] = { try { - lastNewFileFindingTime = System.currentTimeMillis + lastNewFileFindingTime = clock.currentTime() // Calculate ignore threshold val modTimeIgnoreThreshold = math.max( @@ -164,7 +169,7 @@ class FileInputDStream[K: ClassTag, V: ClassTag, F <: NewInputFormat[K,V] : Clas def accept(path: Path): Boolean = isNewFile(path, currentTime, modTimeIgnoreThreshold) } val newFiles = fs.listStatus(directoryPath, filter).map(_.getPath.toString) - val timeTaken = System.currentTimeMillis - lastNewFileFindingTime + val timeTaken = clock.currentTime() - lastNewFileFindingTime logInfo("Finding new files took " + timeTaken + " ms") logDebug("# cached file times = " + fileToModTime.size) if (timeTaken > slideDuration.milliseconds) { @@ -267,7 +272,8 @@ class FileInputDStream[K: ClassTag, V: ClassTag, F <: NewInputFormat[K,V] : Clas logDebug(this.getClass().getSimpleName + ".readObject used") ois.defaultReadObject() generatedRDDs = new mutable.HashMap[Time, RDD[(K,V)]] () - batchTimeToSelectedFiles = new mutable.HashMap[Time, Array[String]]() + batchTimeToSelectedFiles = + new mutable.HashMap[Time, Array[String]] with mutable.SynchronizedMap[Time, Array[String]] recentlySelectedFiles = new mutable.HashSet[String]() fileToModTime = new TimeStampedHashMap[String, Long](true) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/Clock.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/Clock.scala index 7cd867ce34b87..d6d96d7ba00fd 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/Clock.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/Clock.scala @@ -59,9 +59,11 @@ class SystemClock() extends Clock { private[streaming] class ManualClock() extends Clock { - var time = 0L + private var time = 0L - def currentTime() = time + def currentTime() = this.synchronized { + time + } def setTime(timeToSet: Long) = { this.synchronized { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala index 199f5e7161124..e8f4a7779ec21 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala @@ -638,7 +638,7 @@ class BasicOperationsSuite extends TestSuiteBase { if (rememberDuration != null) ssc.remember(rememberDuration) val output = runStreams[(Int, Int)](ssc, cleanupTestInput.size, numExpectedOutput) val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - assert(clock.time === Seconds(10).milliseconds) + assert(clock.currentTime() === Seconds(10).milliseconds) assert(output.size === numExpectedOutput) operatedStream } diff --git 
a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 5d232c6ade7a9..8f8bc61437ba5 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -18,17 +18,18 @@ package org.apache.spark.streaming import java.io.File -import java.nio.charset.Charset -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer} import scala.reflect.ClassTag +import com.google.common.base.Charsets import com.google.common.io.Files import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{IntWritable, Text} import org.apache.hadoop.mapred.TextOutputFormat import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat} +import org.scalatest.concurrent.Eventually._ import org.apache.spark.streaming.dstream.{DStream, FileInputDStream} import org.apache.spark.streaming.util.ManualClock @@ -45,8 +46,6 @@ class CheckpointSuite extends TestSuiteBase { override def batchDuration = Milliseconds(500) - override def actuallyWait = true // to allow checkpoints to be written - override def beforeFunction() { super.beforeFunction() Utils.deleteRecursively(new File(checkpointDir)) @@ -143,7 +142,6 @@ class CheckpointSuite extends TestSuiteBase { ssc.start() advanceTimeWithRealDelay(ssc, 4) ssc.stop() - System.clearProperty("spark.streaming.manualClock.jump") ssc = null } @@ -312,109 +310,161 @@ class CheckpointSuite extends TestSuiteBase { testCheckpointedOperation(input, operation, output, 7) } - // This tests whether file input stream remembers what files were seen before // the master failure and uses them again to process a large window operation. // It also tests whether batches, whose processing was incomplete due to the // failure, are re-processed or not. test("recovery with file input stream") { // Set up the streaming context and input streams + val batchDuration = Seconds(2) // Due to 1-second resolution of setLastModified() on some OS's. 
val testDir = Utils.createTempDir() - var ssc = new StreamingContext(master, framework, Seconds(1)) - ssc.checkpoint(checkpointDir) - val fileStream = ssc.textFileStream(testDir.toString) - // Making value 3 take large time to process, to ensure that the master - // shuts down in the middle of processing the 3rd batch - val mappedStream = fileStream.map(s => { - val i = s.toInt - if (i == 3) Thread.sleep(2000) - i - }) - - // Reducing over a large window to ensure that recovery from master failure - // requires reprocessing of all the files seen before the failure - val reducedStream = mappedStream.reduceByWindow(_ + _, Seconds(30), Seconds(1)) - val outputBuffer = new ArrayBuffer[Seq[Int]] - var outputStream = new TestOutputStream(reducedStream, outputBuffer) - outputStream.register() - ssc.start() - - // Create files and advance manual clock to process them - // var clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - Thread.sleep(1000) - for (i <- Seq(1, 2, 3)) { - Files.write(i + "\n", new File(testDir, i.toString), Charset.forName("UTF-8")) - // wait to make sure that the file is written such that it gets shown in the file listings - Thread.sleep(1000) + val outputBuffer = new ArrayBuffer[Seq[Int]] with SynchronizedBuffer[Seq[Int]] + + /** + * Writes a file named `i` (which contains the number `i`) to the test directory and sets its + * modification time to `clock`'s current time. + */ + def writeFile(i: Int, clock: ManualClock): Unit = { + val file = new File(testDir, i.toString) + Files.write(i + "\n", file, Charsets.UTF_8) + assert(file.setLastModified(clock.currentTime())) + // Check that the file's modification date is actually the value we wrote, since rounding or + // truncation will break the test: + assert(file.lastModified() === clock.currentTime()) } - logInfo("Output = " + outputStream.output.mkString(",")) - assert(outputStream.output.size > 0, "No files processed before restart") - ssc.stop() - // Verify whether files created have been recorded correctly or not - var fileInputDStream = ssc.graph.getInputStreams().head.asInstanceOf[FileInputDStream[_, _, _]] - def recordedFiles = fileInputDStream.batchTimeToSelectedFiles.values.flatten - assert(!recordedFiles.filter(_.endsWith("1")).isEmpty) - assert(!recordedFiles.filter(_.endsWith("2")).isEmpty) - assert(!recordedFiles.filter(_.endsWith("3")).isEmpty) - - // Create files while the master is down - for (i <- Seq(4, 5, 6)) { - Files.write(i + "\n", new File(testDir, i.toString), Charset.forName("UTF-8")) - Thread.sleep(1000) + /** + * Returns ids that identify which files which have been recorded by the file input stream. 
+ */ + def recordedFiles(ssc: StreamingContext): Seq[Int] = { + val fileInputDStream = + ssc.graph.getInputStreams().head.asInstanceOf[FileInputDStream[_, _, _]] + val filenames = fileInputDStream.batchTimeToSelectedFiles.values.flatten + filenames.map(_.split(File.separator).last.toInt).toSeq.sorted } - // Recover context from checkpoint file and verify whether the files that were - // recorded before failure were saved and successfully recovered - logInfo("*********** RESTARTING ************") - ssc = new StreamingContext(checkpointDir) - fileInputDStream = ssc.graph.getInputStreams().head.asInstanceOf[FileInputDStream[_, _, _]] - assert(!recordedFiles.filter(_.endsWith("1")).isEmpty) - assert(!recordedFiles.filter(_.endsWith("2")).isEmpty) - assert(!recordedFiles.filter(_.endsWith("3")).isEmpty) + try { + // This is a var because it's re-assigned when we restart from a checkpoint + var clock: ManualClock = null + withStreamingContext(new StreamingContext(conf, batchDuration)) { ssc => + ssc.checkpoint(checkpointDir) + clock = ssc.scheduler.clock.asInstanceOf[ManualClock] + val batchCounter = new BatchCounter(ssc) + val fileStream = ssc.textFileStream(testDir.toString) + // Make value 3 take a large time to process, to ensure that the driver + // shuts down in the middle of processing the 3rd batch + CheckpointSuite.batchThreeShouldBlockIndefinitely = true + val mappedStream = fileStream.map(s => { + val i = s.toInt + if (i == 3) { + while (CheckpointSuite.batchThreeShouldBlockIndefinitely) { + Thread.sleep(Long.MaxValue) + } + } + i + }) + + // Reducing over a large window to ensure that recovery from driver failure + // requires reprocessing of all the files seen before the failure + val reducedStream = mappedStream.reduceByWindow(_ + _, batchDuration * 30, batchDuration) + val outputStream = new TestOutputStream(reducedStream, outputBuffer) + outputStream.register() + ssc.start() + + // Advance half a batch so that the first file is created after the StreamingContext starts + clock.addToTime(batchDuration.milliseconds / 2) + // Create files and advance manual clock to process them + for (i <- Seq(1, 2, 3)) { + writeFile(i, clock) + // Advance the clock after creating the file to avoid a race when + // setting its modification time + clock.addToTime(batchDuration.milliseconds) + if (i != 3) { + // Since we want to shut down while the 3rd batch is processing + eventually(eventuallyTimeout) { + assert(batchCounter.getNumCompletedBatches === i) + } + } + } + clock.addToTime(batchDuration.milliseconds) + eventually(eventuallyTimeout) { + // Wait until all files have been recorded and all batches have started + assert(recordedFiles(ssc) === Seq(1, 2, 3) && batchCounter.getNumStartedBatches === 3) + } + // Wait for a checkpoint to be written + val fs = new Path(checkpointDir).getFileSystem(ssc.sc.hadoopConfiguration) + eventually(eventuallyTimeout) { + assert(Checkpoint.getCheckpointFiles(checkpointDir, fs).size === 6) + } + ssc.stop() + // Check that we shut down while the third batch was being processed + assert(batchCounter.getNumCompletedBatches === 2) + assert(outputStream.output.flatten === Seq(1, 3)) + } - // Restart stream computation - ssc.start() - for (i <- Seq(7, 8, 9)) { - Files.write(i + "\n", new File(testDir, i.toString), Charset.forName("UTF-8")) - Thread.sleep(1000) - } - Thread.sleep(1000) - logInfo("Output = " + outputStream.output.mkString("[", ", ", "]")) - assert(outputStream.output.size > 0, "No files processed after restart") - ssc.stop() + // The original 
StreamingContext has now been stopped. + CheckpointSuite.batchThreeShouldBlockIndefinitely = false - // Verify whether files created while the driver was down have been recorded or not - assert(!recordedFiles.filter(_.endsWith("4")).isEmpty) - assert(!recordedFiles.filter(_.endsWith("5")).isEmpty) - assert(!recordedFiles.filter(_.endsWith("6")).isEmpty) - - // Verify whether new files created after recover have been recorded or not - assert(!recordedFiles.filter(_.endsWith("7")).isEmpty) - assert(!recordedFiles.filter(_.endsWith("8")).isEmpty) - assert(!recordedFiles.filter(_.endsWith("9")).isEmpty) - - // Append the new output to the old buffer - outputStream = ssc.graph.getOutputStreams().head.asInstanceOf[TestOutputStream[Int]] - outputBuffer ++= outputStream.output - - val expectedOutput = Seq(1, 3, 6, 10, 15, 21, 28, 36, 45) - logInfo("--------------------------------") - logInfo("output, size = " + outputBuffer.size) - outputBuffer.foreach(x => logInfo("[" + x.mkString(",") + "]")) - logInfo("expected output, size = " + expectedOutput.size) - expectedOutput.foreach(x => logInfo("[" + x + "]")) - logInfo("--------------------------------") - - // Verify whether all the elements received are as expected - val output = outputBuffer.flatMap(x => x) - assert(output.contains(6)) // To ensure that the 3rd input (i.e., 3) was processed - output.foreach(o => // To ensure all the inputs are correctly added cumulatively - assert(expectedOutput.contains(o), "Expected value " + o + " not found") - ) - // To ensure that all the inputs were received correctly - assert(expectedOutput.last === output.last) - Utils.deleteRecursively(testDir) + // Create files while the streaming driver is down + for (i <- Seq(4, 5, 6)) { + writeFile(i, clock) + // Advance the clock after creating the file to avoid a race when + // setting its modification time + clock.addToTime(batchDuration.milliseconds) + } + + // Recover context from checkpoint file and verify whether the files that were + // recorded before failure were saved and successfully recovered + logInfo("*********** RESTARTING ************") + withStreamingContext(new StreamingContext(checkpointDir)) { ssc => + // So that the restarted StreamingContext's clock has gone forward in time since failure + ssc.conf.set("spark.streaming.manualClock.jump", (batchDuration * 3).milliseconds.toString) + val oldClockTime = clock.currentTime() + clock = ssc.scheduler.clock.asInstanceOf[ManualClock] + val batchCounter = new BatchCounter(ssc) + val outputStream = ssc.graph.getOutputStreams().head.asInstanceOf[TestOutputStream[Int]] + // Check that we remember files that were recorded before the restart + assert(recordedFiles(ssc) === Seq(1, 2, 3)) + + // Restart stream computation + ssc.start() + // Verify that the clock has traveled forward to the expected time + eventually(eventuallyTimeout) { + clock.currentTime() === oldClockTime + } + // Wait for pre-failure batch to be recomputed (3 while SSC was down plus last batch) + val numBatchesAfterRestart = 4 + eventually(eventuallyTimeout) { + assert(batchCounter.getNumCompletedBatches === numBatchesAfterRestart) + } + for ((i, index) <- Seq(7, 8, 9).zipWithIndex) { + writeFile(i, clock) + // Advance the clock after creating the file to avoid a race when + // setting its modification time + clock.addToTime(batchDuration.milliseconds) + eventually(eventuallyTimeout) { + assert(batchCounter.getNumCompletedBatches === index + numBatchesAfterRestart + 1) + } + } + clock.addToTime(batchDuration.milliseconds) + logInfo("Output 
after restart = " + outputStream.output.mkString("[", ", ", "]")) + assert(outputStream.output.size > 0, "No files processed after restart") + ssc.stop() + + // Verify whether files created while the driver was down (4, 5, 6) and files created after + // recovery (7, 8, 9) have been recorded + assert(recordedFiles(ssc) === (1 to 9)) + + // Append the new output to the old buffer + outputBuffer ++= outputStream.output + + // Verify whether all the elements received are as expected + val expectedOutput = Seq(1, 3, 6, 10, 15, 21, 28, 36, 45) + assert(outputBuffer.flatten.toSet === expectedOutput.toSet) + } + } finally { + Utils.deleteRecursively(testDir) + } } @@ -471,12 +521,12 @@ class CheckpointSuite extends TestSuiteBase { */ def advanceTimeWithRealDelay[V: ClassTag](ssc: StreamingContext, numBatches: Long): Seq[Seq[V]] = { val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - logInfo("Manual clock before advancing = " + clock.time) + logInfo("Manual clock before advancing = " + clock.currentTime()) for (i <- 1 to numBatches.toInt) { clock.addToTime(batchDuration.milliseconds) Thread.sleep(batchDuration.milliseconds) } - logInfo("Manual clock after advancing = " + clock.time) + logInfo("Manual clock after advancing = " + clock.currentTime()) Thread.sleep(batchDuration.milliseconds) val outputStream = ssc.graph.getOutputStreams.filter { dstream => @@ -485,3 +535,7 @@ class CheckpointSuite extends TestSuiteBase { outputStream.output.map(_.flatten) } } + +private object CheckpointSuite extends Serializable { + var batchThreeShouldBlockIndefinitely: Boolean = true +} \ No newline at end of file diff --git a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala index 307052a4a9cbb..bddf51e130422 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala @@ -28,7 +28,6 @@ import java.util.concurrent.{Executors, TimeUnit, ArrayBlockingQueue} import java.util.concurrent.atomic.AtomicInteger import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer, SynchronizedQueue} -import scala.concurrent.duration._ import scala.language.postfixOps import com.google.common.io.Files @@ -234,45 +233,57 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { } def testFileStream(newFilesOnly: Boolean) { - var ssc: StreamingContext = null val testDir: File = null try { + val batchDuration = Seconds(2) val testDir = Utils.createTempDir() + // Create a file that exists before the StreamingContext is created: val existingFile = new File(testDir, "0") Files.write("0\n", existingFile, Charset.forName("UTF-8")) + assert(existingFile.setLastModified(10000) && existingFile.lastModified === 10000) - Thread.sleep(1000) // Set up the streaming context and input streams - val newConf = conf.clone.set( - "spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") - ssc = new StreamingContext(newConf, batchDuration) - val fileStream = ssc.fileStream[LongWritable, Text, TextInputFormat]( - testDir.toString, (x: Path) => true, newFilesOnly = newFilesOnly).map(_._2.toString) - val outputBuffer = new ArrayBuffer[Seq[String]] with SynchronizedBuffer[Seq[String]] - val outputStream = new TestOutputStream(fileStream, outputBuffer) - outputStream.register() - ssc.start() - - // Create files in the directory - val input = Seq(1, 2, 3, 4, 5) - input.foreach { i => - 
Thread.sleep(batchDuration.milliseconds) - val file = new File(testDir, i.toString) - Files.write(i + "\n", file, Charset.forName("UTF-8")) - logInfo("Created file " + file) - } + withStreamingContext(new StreamingContext(conf, batchDuration)) { ssc => + val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] + // This `setTime` call ensures that the clock is past the creation time of `existingFile` + clock.setTime(existingFile.lastModified + batchDuration.milliseconds) + val batchCounter = new BatchCounter(ssc) + val fileStream = ssc.fileStream[LongWritable, Text, TextInputFormat]( + testDir.toString, (x: Path) => true, newFilesOnly = newFilesOnly).map(_._2.toString) + val outputBuffer = new ArrayBuffer[Seq[String]] with SynchronizedBuffer[Seq[String]] + val outputStream = new TestOutputStream(fileStream, outputBuffer) + outputStream.register() + ssc.start() + + // Advance the clock so that the files are created after StreamingContext starts, but + // not enough to trigger a batch + clock.addToTime(batchDuration.milliseconds / 2) + + // Over time, create files in the directory + val input = Seq(1, 2, 3, 4, 5) + input.foreach { i => + val file = new File(testDir, i.toString) + Files.write(i + "\n", file, Charset.forName("UTF-8")) + assert(file.setLastModified(clock.currentTime())) + assert(file.lastModified === clock.currentTime) + logInfo("Created file " + file) + // Advance the clock after creating the file to avoid a race when + // setting its modification time + clock.addToTime(batchDuration.milliseconds) + eventually(eventuallyTimeout) { + assert(batchCounter.getNumCompletedBatches === i) + } + } - // Verify that all the files have been read - val expectedOutput = if (newFilesOnly) { - input.map(_.toString).toSet - } else { - (Seq(0) ++ input).map(_.toString).toSet - } - eventually(timeout(maxWaitTimeMillis milliseconds), interval(100 milliseconds)) { + // Verify that all the files have been read + val expectedOutput = if (newFilesOnly) { + input.map(_.toString).toSet + } else { + (Seq(0) ++ input).map(_.toString).toSet + } assert(outputBuffer.flatten.toSet === expectedOutput) } } finally { - if (ssc != null) ssc.stop() if (testDir != null) Utils.deleteRecursively(testDir) } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala index 52972f63c6c5c..7d82c3e4aadcf 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -21,11 +21,16 @@ import java.io.{ObjectInputStream, IOException} import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.SynchronizedBuffer +import scala.language.implicitConversions import scala.reflect.ClassTag import org.scalatest.{BeforeAndAfter, FunSuite} +import org.scalatest.time.{Span, Seconds => ScalaTestSeconds} +import org.scalatest.concurrent.Eventually.timeout +import org.scalatest.concurrent.PatienceConfiguration import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} +import org.apache.spark.streaming.scheduler.{StreamingListenerBatchStarted, StreamingListenerBatchCompleted, StreamingListener} import org.apache.spark.streaming.util.ManualClock import org.apache.spark.{SparkConf, Logging} import org.apache.spark.rdd.RDD @@ -103,6 +108,40 @@ class TestOutputStreamWithPartitions[T: ClassTag](parent: DStream[T], def toTestOutputStream = new TestOutputStream[T](this.parent, this.output.map(_.flatten)) 
} +/** + * An object that counts the number of started / completed batches. This is implemented using a + * StreamingListener. Constructing a new instance automatically registers a StreamingListener on + * the given StreamingContext. + */ +class BatchCounter(ssc: StreamingContext) { + + // All access to this state should be guarded by `BatchCounter.this.synchronized` + private var numCompletedBatches = 0 + private var numStartedBatches = 0 + + private val listener = new StreamingListener { + override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = + BatchCounter.this.synchronized { + numStartedBatches += 1 + BatchCounter.this.notifyAll() + } + override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = + BatchCounter.this.synchronized { + numCompletedBatches += 1 + BatchCounter.this.notifyAll() + } + } + ssc.addStreamingListener(listener) + + def getNumCompletedBatches: Int = this.synchronized { + numCompletedBatches + } + + def getNumStartedBatches: Int = this.synchronized { + numStartedBatches + } +} + /** * This is the base trait for Spark Streaming testsuites. This provides basic functionality * to run user-defined set of input on user-defined stream operations, and verify the output. @@ -142,6 +181,9 @@ trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { .setMaster(master) .setAppName(framework) + // Timeout for use in ScalaTest `eventually` blocks + val eventuallyTimeout: PatienceConfiguration.Timeout = timeout(Span(10, ScalaTestSeconds)) + // Default before function for any streaming test suite. Override this // if you want to add your stuff to "before" (i.e., don't call before { } ) def beforeFunction() { @@ -291,7 +333,7 @@ trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { // Advance manual clock val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - logInfo("Manual clock before advancing = " + clock.time) + logInfo("Manual clock before advancing = " + clock.currentTime()) if (actuallyWait) { for (i <- 1 to numBatches) { logInfo("Actually waiting for " + batchDuration) @@ -301,7 +343,7 @@ trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { } else { clock.addToTime(numBatches * batchDuration.milliseconds) } - logInfo("Manual clock after advancing = " + clock.time) + logInfo("Manual clock after advancing = " + clock.currentTime()) // Wait until expected number of output items have been generated val startTime = System.currentTimeMillis() From 5e3ec1110495899a298313c4aa9c6c151c1f54da Mon Sep 17 00:00:00 2001 From: kj-ki Date: Tue, 6 Jan 2015 09:49:37 -0800 Subject: [PATCH 065/116] [Minor] Fix comments for GraphX 2D partitioning strategy The sum of vertices on matrix (v0 to v11) is 12. And, I think one same block overlaps in this strategy. This is minor PR, so I didn't file in JIRA. Author: kj-ki Closes #3904 from kj-ki/fix-partitionstrategy-comments and squashes the following commits: 79829d9 [kj-ki] Fix comments for 2D partitioning. 
--- .../scala/org/apache/spark/graphx/PartitionStrategy.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala b/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala index 13033fee0e6b5..7372dfbd9fe98 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala @@ -32,9 +32,9 @@ trait PartitionStrategy extends Serializable { object PartitionStrategy { /** * Assigns edges to partitions using a 2D partitioning of the sparse edge adjacency matrix, - * guaranteeing a `2 * sqrt(numParts)` bound on vertex replication. + * guaranteeing a `2 * sqrt(numParts) - 1` bound on vertex replication. * - * Suppose we have a graph with 11 vertices that we want to partition + * Suppose we have a graph with 12 vertices that we want to partition * over 9 machines. We can use the following sparse matrix representation: * *
    @@ -61,7 +61,7 @@ object PartitionStrategy {
        * that edges adjacent to `v11` can only be in the first column of blocks `(P0, P3,
        * P6)` or the last
        * row of blocks `(P6, P7, P8)`.  As a consequence we can guarantee that `v11` will need to be
    -   * replicated to at most `2 * sqrt(numParts)` machines.
    +   * replicated to at most `2 * sqrt(numParts) - 1` machines.
        *
        * Notice that `P0` has many edges and as a consequence this partitioning would lead to poor work
        * balance.  To improve balance we first multiply each vertex id by a large prime to shuffle the
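    As a rough illustration of the scheme described in the comment above (a sketch only, not the actual GraphX
    implementation; the method name and the mixing prime below are assumptions for this example), an edge
    (src, dst) lands in the block at (column of src, row of dst) on a ceil(sqrt(numParts)) x ceil(sqrt(numParts))
    grid of partitions:

        // Sketch only: assign edge (src, dst) to one of numParts partitions using a
        // 2D block layout of the sparse adjacency matrix.
        def edgePartition2D(src: Long, dst: Long, numParts: Int): Int = {
          val ceilSqrt = math.ceil(math.sqrt(numParts)).toInt
          // A large prime (assumed here) shuffles skewed vertex ids across blocks.
          val mixingPrime = 1125899906842597L
          val col = (math.abs(src * mixingPrime) % ceilSqrt).toInt
          val row = (math.abs(dst * mixingPrime) % ceilSqrt).toInt
          (col * ceilSqrt + row) % numParts
        }

    A vertex used as a source always maps to one column of blocks and, as a destination, to one row of blocks;
    the two overlap in exactly one block, which gives the `2 * sqrt(numParts) - 1` replication bound (for
    numParts = 9, at most 2 * 3 - 1 = 5 copies).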
    
    From 4cba6eb42031b1a4cc3308833116ca5d9ccb1a89 Mon Sep 17 00:00:00 2001
    From: Sean Owen 
    Date: Tue, 6 Jan 2015 12:02:08 -0800
    Subject: [PATCH 066/116] SPARK-4159 [CORE] Maven build doesn't run JUnit test
     suites
    
    This PR:
    
    - Reenables `surefire`, and copies config from `scalatest` (which is itself an old fork of `surefire`, so similar)
    - Tells `surefire` to test only Java tests
    - Enables `surefire` and `scalatest` for all children, and in turn eliminates some duplication.
    
    For me, this seems to cause the Scala and Java tests to each be run once, as desired. It doesn't affect the SBT build, but it works for Maven. I still need to verify that all of the Scala and Java tests are being run.
    
    Author: Sean Owen 
    
    Closes #3651 from srowen/SPARK-4159 and squashes the following commits:
    
    2e8a0af [Sean Owen] Remove specialized SPARK_HOME setting for REPL, YARN tests as it appears to be obsolete
    12e4558 [Sean Owen] Append to unit-test.log instead of overwriting, so that both surefire and scalatest output is preserved. Also standardize/correct comments a bit.
    e6f8601 [Sean Owen] Reenable Java tests by reenabling surefire with config cloned from scalatest; centralize test config in the parent
    ---
     bagel/pom.xml                                 | 11 -----
     bagel/src/test/resources/log4j.properties     |  4 +-
     core/pom.xml                                  | 18 --------
     core/src/test/resources/log4j.properties      |  4 +-
     examples/pom.xml                              |  5 ---
     external/flume-sink/pom.xml                   |  9 ----
     .../src/test/resources/log4j.properties       |  3 +-
     external/flume/pom.xml                        | 11 -----
     .../flume/src/test/resources/log4j.properties |  5 +--
     external/kafka/pom.xml                        | 11 -----
     .../kafka/src/test/resources/log4j.properties |  5 +--
     external/mqtt/pom.xml                         | 11 -----
     .../mqtt/src/test/resources/log4j.properties  |  5 +--
     external/twitter/pom.xml                      | 11 -----
     .../src/test/resources/log4j.properties       |  5 +--
     external/zeromq/pom.xml                       | 11 -----
     .../src/test/resources/log4j.properties       |  5 +--
     extras/java8-tests/pom.xml                    | 15 -------
     .../src/test/resources/log4j.properties       |  2 +-
     extras/kinesis-asl/pom.xml                    | 11 -----
     .../src/test/resources/log4j.properties       |  5 ++-
     graphx/pom.xml                                | 11 -----
     graphx/src/test/resources/log4j.properties    |  4 +-
     mllib/pom.xml                                 | 11 -----
     mllib/src/test/resources/log4j.properties     |  4 +-
     network/common/pom.xml                        |  5 ---
     network/shuffle/pom.xml                       |  5 ---
     pom.xml                                       | 45 +++++++++++++++++--
     repl/pom.xml                                  | 14 ------
     repl/src/test/resources/log4j.properties      |  4 +-
     sql/catalyst/pom.xml                          | 10 -----
     sql/core/pom.xml                              | 11 -----
     sql/hive-thriftserver/pom.xml                 |  9 ----
     sql/hive/pom.xml                              |  5 ---
     streaming/pom.xml                             | 10 -----
     streaming/src/test/resources/log4j.properties |  5 +--
     tools/pom.xml                                 |  9 ----
     yarn/pom.xml                                  | 14 ------
     yarn/src/test/resources/log4j.properties      |  4 +-
     39 files changed, 70 insertions(+), 277 deletions(-)
    
    diff --git a/bagel/pom.xml b/bagel/pom.xml
    index 0327ffa402671..3bcd38fa3245c 100644
    --- a/bagel/pom.xml
    +++ b/bagel/pom.xml
    @@ -44,11 +44,6 @@
           org.eclipse.jetty
           jetty-server
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    @@ -58,11 +53,5 @@
       
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
    -    
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
    -    
       
     
    diff --git a/bagel/src/test/resources/log4j.properties b/bagel/src/test/resources/log4j.properties
    index 789869f72e3b0..853ef0ed2986f 100644
    --- a/bagel/src/test/resources/log4j.properties
    +++ b/bagel/src/test/resources/log4j.properties
    @@ -15,10 +15,10 @@
     # limitations under the License.
     #
     
    -# Set everything to be logged to the file bagel/target/unit-tests.log
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/core/pom.xml b/core/pom.xml
    index c5c41b2b5de42..d9a49c9e08afc 100644
    --- a/core/pom.xml
    +++ b/core/pom.xml
    @@ -276,11 +276,6 @@
           selenium-java
           test
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.mockito
           mockito-all
    @@ -326,19 +321,6 @@
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
         
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -        
    -          
    -            test
    -            
    -              test
    -            
    -          
    -        
    -      
    -
           
           
             org.apache.maven.plugins
    diff --git a/core/src/test/resources/log4j.properties b/core/src/test/resources/log4j.properties
    index 9dd05f17f012b..287c8e3563503 100644
    --- a/core/src/test/resources/log4j.properties
    +++ b/core/src/test/resources/log4j.properties
    @@ -15,10 +15,10 @@
     # limitations under the License.
     #
     
    -# Set everything to be logged to the file core/target/unit-tests.log
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/examples/pom.xml b/examples/pom.xml
    index 8713230e1e8ed..bdc5d0562f3e1 100644
    --- a/examples/pom.xml
    +++ b/examples/pom.xml
    @@ -244,11 +244,6 @@
           algebird-core_${scala.binary.version}
           0.8.1
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
    index 72618b6515f83..71f595d0a6800 100644
    --- a/external/flume-sink/pom.xml
    +++ b/external/flume-sink/pom.xml
    @@ -65,11 +65,6 @@
             
           
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scala-lang
           scala-library
    @@ -91,10 +86,6 @@
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
         
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
           
             org.apache.avro
             avro-maven-plugin
    diff --git a/external/flume-sink/src/test/resources/log4j.properties b/external/flume-sink/src/test/resources/log4j.properties
    index 4411d6e20c52a..2a58e99817224 100644
    --- a/external/flume-sink/src/test/resources/log4j.properties
    +++ b/external/flume-sink/src/test/resources/log4j.properties
    @@ -17,9 +17,8 @@
     
     # Set everything to be logged to the file streaming/target/unit-tests.log
     log4j.rootCategory=INFO, file
    -# log4j.appender.file=org.apache.log4j.FileAppender
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/external/flume/pom.xml b/external/flume/pom.xml
    index a682f0e8471d8..0374262212e08 100644
    --- a/external/flume/pom.xml
    +++ b/external/flume/pom.xml
    @@ -61,11 +61,6 @@
             
           
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    @@ -85,11 +80,5 @@
       
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
    -    
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
    -    
       
     
    diff --git a/external/flume/src/test/resources/log4j.properties b/external/flume/src/test/resources/log4j.properties
    index 4411d6e20c52a..9697237bfa1a3 100644
    --- a/external/flume/src/test/resources/log4j.properties
    +++ b/external/flume/src/test/resources/log4j.properties
    @@ -15,11 +15,10 @@
     # limitations under the License.
     #
     
    -# Set everything to be logged to the file streaming/target/unit-tests.log
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
    -# log4j.appender.file=org.apache.log4j.FileAppender
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml
    index b3f44471cd326..b29b0509656ba 100644
    --- a/external/kafka/pom.xml
    +++ b/external/kafka/pom.xml
    @@ -74,11 +74,6 @@
           3.2
           test
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    @@ -98,11 +93,5 @@
       
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
    -    
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
    -    
       
     
    diff --git a/external/kafka/src/test/resources/log4j.properties b/external/kafka/src/test/resources/log4j.properties
    index 4411d6e20c52a..9697237bfa1a3 100644
    --- a/external/kafka/src/test/resources/log4j.properties
    +++ b/external/kafka/src/test/resources/log4j.properties
    @@ -15,11 +15,10 @@
     # limitations under the License.
     #
     
    -# Set everything to be logged to the file streaming/target/unit-tests.log
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
    -# log4j.appender.file=org.apache.log4j.FileAppender
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml
    index d478267b605ba..560c8b9d18276 100644
    --- a/external/mqtt/pom.xml
    +++ b/external/mqtt/pom.xml
    @@ -46,11 +46,6 @@
           org.eclipse.paho.client.mqttv3
           1.0.1
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    @@ -76,11 +71,5 @@
       
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
    -    
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
    -    
       
     
    diff --git a/external/mqtt/src/test/resources/log4j.properties b/external/mqtt/src/test/resources/log4j.properties
    index 4411d6e20c52a..9697237bfa1a3 100644
    --- a/external/mqtt/src/test/resources/log4j.properties
    +++ b/external/mqtt/src/test/resources/log4j.properties
    @@ -15,11 +15,10 @@
     # limitations under the License.
     #
     
    -# Set everything to be logged to the file streaming/target/unit-tests.log
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
    -# log4j.appender.file=org.apache.log4j.FileAppender
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml
    index 000ace1446e5e..da6ffe7662f63 100644
    --- a/external/twitter/pom.xml
    +++ b/external/twitter/pom.xml
    @@ -46,11 +46,6 @@
           twitter4j-stream
           3.0.3
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    @@ -70,11 +65,5 @@
       
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
    -    
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
    -    
       
     
    diff --git a/external/twitter/src/test/resources/log4j.properties b/external/twitter/src/test/resources/log4j.properties
    index 4411d6e20c52a..64bfc5745088f 100644
    --- a/external/twitter/src/test/resources/log4j.properties
    +++ b/external/twitter/src/test/resources/log4j.properties
    @@ -15,11 +15,10 @@
     # limitations under the License.
     #
     
    -# Set everything to be logged to the file streaming/target/unit-tests.log
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
    -# log4j.appender.file=org.apache.log4j.FileAppender
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml
    index 29c452093502e..2fb5f0ed2f57c 100644
    --- a/external/zeromq/pom.xml
    +++ b/external/zeromq/pom.xml
    @@ -46,11 +46,6 @@
           akka-zeromq_${scala.binary.version}
           ${akka.version}
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    @@ -70,11 +65,5 @@
       
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
    -    
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
    -    
       
     
    diff --git a/external/zeromq/src/test/resources/log4j.properties b/external/zeromq/src/test/resources/log4j.properties
    index 4411d6e20c52a..9697237bfa1a3 100644
    --- a/external/zeromq/src/test/resources/log4j.properties
    +++ b/external/zeromq/src/test/resources/log4j.properties
    @@ -15,11 +15,10 @@
     # limitations under the License.
     #
     
    -# Set everything to be logged to the file streaming/target/unit-tests.log
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
    -# log4j.appender.file=org.apache.log4j.FileAppender
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml
    index c8477a6566311..0fb431808bacd 100644
    --- a/extras/java8-tests/pom.xml
    +++ b/extras/java8-tests/pom.xml
    @@ -60,11 +60,6 @@
           junit-interface
           test
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
       
     
       
    @@ -159,16 +154,6 @@
               
             
           
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -        
    -          
    -            test
    -            none
    -          
    -        
    -      
         
       
     
    diff --git a/extras/java8-tests/src/test/resources/log4j.properties b/extras/java8-tests/src/test/resources/log4j.properties
    index bb0ab319a0080..287c8e3563503 100644
    --- a/extras/java8-tests/src/test/resources/log4j.properties
    +++ b/extras/java8-tests/src/test/resources/log4j.properties
    @@ -18,7 +18,7 @@
     # Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml
    index c0d3a61119113..c815eda52bda7 100644
    --- a/extras/kinesis-asl/pom.xml
    +++ b/extras/kinesis-asl/pom.xml
    @@ -57,11 +57,6 @@
           aws-java-sdk
           ${aws.java.sdk.version}
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.mockito
           mockito-all
    @@ -86,11 +81,5 @@
       
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
    -    
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
    -    
       
     
    diff --git a/extras/kinesis-asl/src/test/resources/log4j.properties b/extras/kinesis-asl/src/test/resources/log4j.properties
    index d9d08f68687d3..853ef0ed2986f 100644
    --- a/extras/kinesis-asl/src/test/resources/log4j.properties
    +++ b/extras/kinesis-asl/src/test/resources/log4j.properties
    @@ -14,10 +14,11 @@
     # See the License for the specific language governing permissions and
     # limitations under the License.
     #
    +
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
    -# log4j.appender.file=org.apache.log4j.FileAppender
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/graphx/pom.xml b/graphx/pom.xml
    index 9982b36f9b62f..91db799d244ad 100644
    --- a/graphx/pom.xml
    +++ b/graphx/pom.xml
    @@ -49,11 +49,6 @@
           org.eclipse.jetty
           jetty-server
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    @@ -63,11 +58,5 @@
       
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
    -    
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
    -    
       
     
    diff --git a/graphx/src/test/resources/log4j.properties b/graphx/src/test/resources/log4j.properties
    index 9dd05f17f012b..287c8e3563503 100644
    --- a/graphx/src/test/resources/log4j.properties
    +++ b/graphx/src/test/resources/log4j.properties
    @@ -15,10 +15,10 @@
     # limitations under the License.
     #
     
    -# Set everything to be logged to the file core/target/unit-tests.log
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/mllib/pom.xml b/mllib/pom.xml
    index 0a6dda0ab8c80..2198757481684 100644
    --- a/mllib/pom.xml
    +++ b/mllib/pom.xml
    @@ -80,11 +80,6 @@
           org.apache.commons
           commons-math3
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    @@ -129,12 +124,6 @@
       
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
    -    
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
    -    
         
           
             ../python
    diff --git a/mllib/src/test/resources/log4j.properties b/mllib/src/test/resources/log4j.properties
    index a469badf603c6..9697237bfa1a3 100644
    --- a/mllib/src/test/resources/log4j.properties
    +++ b/mllib/src/test/resources/log4j.properties
    @@ -15,10 +15,10 @@
     # limitations under the License.
     #
     
    -# Set everything to be logged to the file core/target/unit-tests.log
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/network/common/pom.xml b/network/common/pom.xml
    index baca859fa5011..245a96b8c4038 100644
    --- a/network/common/pom.xml
    +++ b/network/common/pom.xml
    @@ -75,11 +75,6 @@
           mockito-all
           test
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
       
     
       
    diff --git a/network/shuffle/pom.xml b/network/shuffle/pom.xml
    index 12468567c3aed..5bfa1ac9c373e 100644
    --- a/network/shuffle/pom.xml
    +++ b/network/shuffle/pom.xml
    @@ -83,11 +83,6 @@
           mockito-all
           test
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
       
     
       
    diff --git a/pom.xml b/pom.xml
    index 05f59a9b4140b..46ff211f91160 100644
    --- a/pom.xml
    +++ b/pom.xml
    @@ -256,7 +256,7 @@
           1.0.0
         
         
    @@ -266,6 +266,15 @@
           2.3.7
           provided
         
    +    
    +    
    +      org.scalatest
    +      scalatest_${scala.binary.version}
    +      test
    +    
       
       
         
    @@ -935,19 +944,38 @@
                 true
               
             
    +        
             
               org.apache.maven.plugins
               maven-surefire-plugin
    -          2.17
    +          2.18
    +          
               
    -            
    -            true
    +            
    +              **/Test*.java
    +              **/*Test.java
    +              **/*TestCase.java
    +              **/*Suite.java
    +            
    +            ${project.build.directory}/surefire-reports
    +            -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m
    +            
    +              true
    +              ${session.executionRootDirectory}
    +              1
    +              false
    +              false
    +              ${test_classpath}
    +              true
    +            
               
             
    +        
             
               org.scalatest
               scalatest-maven-plugin
               1.0
    +          
               
                 ${project.build.directory}/surefire-reports
                 .
    @@ -1159,6 +1187,15 @@
               
             
           
    +      
    +      
    +        org.apache.maven.plugins
    +        maven-surefire-plugin
    +      
    +      
    +        org.scalatest
    +        scalatest-maven-plugin
    +      
         
       
     
    diff --git a/repl/pom.xml b/repl/pom.xml
    index 9b2290429fee5..97165e024926e 100644
    --- a/repl/pom.xml
    +++ b/repl/pom.xml
    @@ -86,11 +86,6 @@
           org.slf4j
           jul-to-slf4j
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    @@ -115,15 +110,6 @@
               true
             
           
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -        
    -          
    -            ${basedir}/..
    -          
    -        
    -      
           
           
             org.codehaus.mojo
    diff --git a/repl/src/test/resources/log4j.properties b/repl/src/test/resources/log4j.properties
    index 52098993f5c3c..e7e4a4113174a 100644
    --- a/repl/src/test/resources/log4j.properties
    +++ b/repl/src/test/resources/log4j.properties
    @@ -15,10 +15,10 @@
     # limitations under the License.
     #
     
    -# Set everything to be logged to the repl/target/unit-tests.log
    +# Set everything to be logged to the file target/unit-tests.log
     log4j.rootCategory=INFO, file
     log4j.appender.file=org.apache.log4j.FileAppender
    -log4j.appender.file.append=false
    +log4j.appender.file.append=true
     log4j.appender.file.file=target/unit-tests.log
     log4j.appender.file.layout=org.apache.log4j.PatternLayout
     log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
    diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml
    index 1caa297e24e37..a1947fb022e54 100644
    --- a/sql/catalyst/pom.xml
    +++ b/sql/catalyst/pom.xml
    @@ -50,11 +50,6 @@
           spark-core_${scala.binary.version}
           ${project.version}
         
    -    
    -      org.scalatest
    -      scalatest_${scala.binary.version}
    -      test
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    @@ -65,11 +60,6 @@
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
         
    -      
    -        org.scalatest
    -        scalatest-maven-plugin
    -      
    -
           
    +    
    +      hadoop-provided
    +      
    +        provided
    +      
    +    
    +    
    +      hive-provided
    +      
    +        provided
    +      
    +    
    +    
    +      parquet-provided
    +      
    +        provided
    +      
    +    
       
     
    diff --git a/bagel/pom.xml b/bagel/pom.xml
    index 3bcd38fa3245c..510e92640eff8 100644
    --- a/bagel/pom.xml
    +++ b/bagel/pom.xml
    @@ -40,10 +40,6 @@
           spark-core_${scala.binary.version}
           ${project.version}
         
    -    
    -      org.eclipse.jetty
    -      jetty-server
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd
    index a4c099fb45b14..088f993954d9e 100644
    --- a/bin/compute-classpath.cmd
    +++ b/bin/compute-classpath.cmd
    @@ -109,6 +109,13 @@ if "x%YARN_CONF_DIR%"=="x" goto no_yarn_conf_dir
       set CLASSPATH=%CLASSPATH%;%YARN_CONF_DIR%
     :no_yarn_conf_dir
     
    +rem To allow for distributions to append needed libraries to the classpath (e.g. when
    +rem using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and
    +rem append it to the final classpath.
    +if not "x%SPARK_DIST_CLASSPATH%"=="x" (
    +  set CLASSPATH=%CLASSPATH%;%SPARK_DIST_CLASSPATH%
    +)
    +
     rem A bit of a hack to allow calling this script within run2.cmd without seeing output
     if "%DONT_PRINT_CLASSPATH%"=="1" goto exit
     
    diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
    index a31ea73d3ce19..8f3b396ffd086 100755
    --- a/bin/compute-classpath.sh
    +++ b/bin/compute-classpath.sh
    @@ -146,4 +146,11 @@ if [ -n "$YARN_CONF_DIR" ]; then
       CLASSPATH="$CLASSPATH:$YARN_CONF_DIR"
     fi
     
    +# To allow for distributions to append needed libraries to the classpath (e.g. when
    +# using the "hadoop-provided" profile to build Spark), check SPARK_DIST_CLASSPATH and
    +# append it to the final classpath.
    +if [ -n "$SPARK_DIST_CLASSPATH" ]; then
    +  CLASSPATH="$CLASSPATH:$SPARK_DIST_CLASSPATH"
    +fi
    +
     echo "$CLASSPATH"
    diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala
    index 8c7de75600b5f..7eb87a564d6f5 100644
    --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala
    +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala
    @@ -55,19 +55,26 @@ private[spark] class SparkDeploySchedulerBackend(
           "{{WORKER_URL}}")
         val extraJavaOpts = sc.conf.getOption("spark.executor.extraJavaOptions")
           .map(Utils.splitCommandString).getOrElse(Seq.empty)
    -    val classPathEntries = sc.conf.getOption("spark.executor.extraClassPath").toSeq.flatMap { cp =>
    -      cp.split(java.io.File.pathSeparator)
    -    }
    -    val libraryPathEntries =
    -      sc.conf.getOption("spark.executor.extraLibraryPath").toSeq.flatMap { cp =>
    -        cp.split(java.io.File.pathSeparator)
    +    val classPathEntries = sc.conf.getOption("spark.executor.extraClassPath")
    +      .map(_.split(java.io.File.pathSeparator).toSeq).getOrElse(Nil)
    +    val libraryPathEntries = sc.conf.getOption("spark.executor.extraLibraryPath")
    +      .map(_.split(java.io.File.pathSeparator).toSeq).getOrElse(Nil)
    +
    +    // When testing, expose the parent class path to the child. This is processed by
    +    // compute-classpath.{cmd,sh} and makes all needed jars available to child processes
    +    // when the assembly is built with the "*-provided" profiles enabled.
    +    val testingClassPath =
    +      if (sys.props.contains("spark.testing")) {
    +        sys.props("java.class.path").split(java.io.File.pathSeparator).toSeq
    +      } else {
    +        Nil
           }
     
         // Start executors with a few necessary configs for registering with the scheduler
         val sparkJavaOpts = Utils.sparkJavaOpts(conf, SparkConf.isExecutorStartupConf)
         val javaOpts = sparkJavaOpts ++ extraJavaOpts
         val command = Command("org.apache.spark.executor.CoarseGrainedExecutorBackend",
    -      args, sc.executorEnvs, classPathEntries, libraryPathEntries, javaOpts)
    +      args, sc.executorEnvs, classPathEntries ++ testingClassPath, libraryPathEntries, javaOpts)
         val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")
         val appDesc = new ApplicationDescription(sc.appName, maxCores, sc.executorMemory, command,
           appUIAddress, sc.eventLogDir)
    diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
    index 9d6b6161ce4da..c4f1898a2db15 100644
    --- a/core/src/main/scala/org/apache/spark/util/Utils.scala
    +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
    @@ -990,11 +990,12 @@ private[spark] object Utils extends Logging {
         for ((key, value) <- extraEnvironment) {
           environment.put(key, value)
         }
    +
         val process = builder.start()
         new Thread("read stderr for " + command(0)) {
           override def run() {
             for (line <- Source.fromInputStream(process.getErrorStream).getLines()) {
    -          System.err.println(line)
    +          logInfo(line)
             }
           }
         }.start()
    @@ -1089,7 +1090,7 @@ private[spark] object Utils extends Logging {
         var firstUserLine = 0
         var insideSpark = true
         var callStack = new ArrayBuffer[String]() :+ ""
    - 
    +
         Thread.currentThread.getStackTrace().foreach { ste: StackTraceElement =>
           // When running under some profilers, the current stack trace might contain some bogus
           // frames. This is intended to ensure that we don't crash in these situations by
    diff --git a/core/src/test/scala/org/apache/spark/DriverSuite.scala b/core/src/test/scala/org/apache/spark/DriverSuite.scala
    index 541d8eac80556..8a54360e81795 100644
    --- a/core/src/test/scala/org/apache/spark/DriverSuite.scala
    +++ b/core/src/test/scala/org/apache/spark/DriverSuite.scala
    @@ -35,7 +35,7 @@ class DriverSuite extends FunSuite with Timeouts {
         forAll(masters) { (master: String) =>
           failAfter(60 seconds) {
             Utils.executeAndGetOutput(
    -          Seq("./bin/spark-class", "org.apache.spark.DriverWithoutCleanup", master),
    +          Seq(s"$sparkHome/bin/spark-class", "org.apache.spark.DriverWithoutCleanup", master),
               new File(sparkHome),
               Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome))
           }
    diff --git a/examples/pom.xml b/examples/pom.xml
    index bdc5d0562f3e1..002d4458c4b3e 100644
    --- a/examples/pom.xml
    +++ b/examples/pom.xml
    @@ -98,143 +98,145 @@
           ${project.version}
         
         
    -      org.eclipse.jetty
    -      jetty-server
    +      org.apache.hbase
    +      hbase-testing-util
    +      ${hbase.version}
    +      ${hbase.deps.scope}
    +      
    +        
    +          
    +          org.apache.hbase
    +          hbase-annotations
    +        
    +        
    +          org.jruby
    +          jruby-complete
    +        
    +      
    +    
    +    
    +      org.apache.hbase
    +      hbase-protocol
    +      ${hbase.version}
    +      ${hbase.deps.scope}
    +    
    +    
    +      org.apache.hbase
    +      hbase-common
    +      ${hbase.version}
    +      ${hbase.deps.scope}
    +      
    +        
    +          
    +          org.apache.hbase
    +          hbase-annotations
    +        
    +      
    +    
    +    
    +      org.apache.hbase
    +      hbase-client
    +      ${hbase.version}
    +      ${hbase.deps.scope}
    +      
    +        
    +          
    +          org.apache.hbase
    +          hbase-annotations
    +        
    +       
    +        io.netty
    +        netty
    +       
    +     
    +    
    +    
    +      org.apache.hbase
    +      hbase-server
    +      ${hbase.version}
    +      ${hbase.deps.scope}
    +      
    +        
    +          
    +          org.apache.hbase
    +          hbase-annotations
    +        
    +        
    +          org.apache.hadoop
    +          hadoop-core
    +        
    +        
    +          org.apache.hadoop
    +          hadoop-client
    +        
    +        
    +          org.apache.hadoop
    +          hadoop-mapreduce-client-jobclient
    +        
    +        
    +          org.apache.hadoop
    +          hadoop-mapreduce-client-core
    +        
    +        
    +          org.apache.hadoop
    +          hadoop-auth
    +        
    +        
    +          org.apache.hadoop
    +          hadoop-annotations
    +        
    +        
    +          org.apache.hadoop
    +          hadoop-hdfs
    +        
    +        
    +          org.apache.hbase
    +          hbase-hadoop1-compat
    +        
    +        
    +          org.apache.commons
    +          commons-math
    +        
    +        
    +          com.sun.jersey
    +          jersey-core
    +        
    +        
    +          org.slf4j
    +          slf4j-api
    +        
    +        
    +          com.sun.jersey
    +          jersey-server
    +        
    +        
    +          com.sun.jersey
    +          jersey-core
    +        
    +        
    +          com.sun.jersey
    +          jersey-json
    +        
    +        
    +          
    +          commons-io
    +          commons-io
    +        
    +      
    +    
    +    
    +      org.apache.hbase
    +      hbase-hadoop-compat
    +      ${hbase.version}
    +      ${hbase.deps.scope}
    +    
    +    
    +      org.apache.hbase
    +      hbase-hadoop-compat
    +      ${hbase.version}
    +      test-jar
    +      test
         
    -      
    -        org.apache.hbase
    -        hbase-testing-util
    -        ${hbase.version}
    -        
    -          
    -            
    -            org.apache.hbase
    -            hbase-annotations
    -          
    -          
    -            org.jruby
    -            jruby-complete
    -          
    -        
    -      
    -      
    -        org.apache.hbase
    -        hbase-protocol
    -        ${hbase.version}
    -      
    -      
    -        org.apache.hbase
    -        hbase-common
    -        ${hbase.version}
    -        
    -          
    -            
    -            org.apache.hbase
    -            hbase-annotations
    -          
    -        
    -      
    -      
    -        org.apache.hbase
    -        hbase-client
    -        ${hbase.version}
    -        
    -          
    -            
    -            org.apache.hbase
    -            hbase-annotations
    -          
    -         
    -          io.netty
    -          netty
    -         
    -       
    -      
    -      
    -        org.apache.hbase
    -        hbase-server
    -        ${hbase.version}
    -        
    -          
    -            org.apache.hadoop
    -            hadoop-core
    -          
    -          
    -            org.apache.hadoop
    -            hadoop-client
    -          
    -          
    -            org.apache.hadoop
    -            hadoop-mapreduce-client-jobclient
    -          
    -          
    -            org.apache.hadoop
    -            hadoop-mapreduce-client-core
    -          
    -          
    -            org.apache.hadoop
    -            hadoop-auth
    -          
    -          
    -            
    -            org.apache.hbase
    -            hbase-annotations
    -          
    -          
    -            org.apache.hadoop
    -            hadoop-annotations
    -          
    -          
    -            org.apache.hadoop
    -            hadoop-hdfs
    -          
    -          
    -            org.apache.hbase
    -            hbase-hadoop1-compat
    -          
    -          
    -            org.apache.commons
    -            commons-math
    -          
    -          
    -            com.sun.jersey
    -            jersey-core
    -          
    -          
    -            org.slf4j
    -            slf4j-api
    -          
    -          
    -            com.sun.jersey
    -            jersey-server
    -          
    -          
    -            com.sun.jersey
    -            jersey-core
    -          
    -          
    -            com.sun.jersey
    -            jersey-json
    -          
    -          
    -            
    -            commons-io
    -            commons-io
    -          
    -        
    -      
    -      
    -        org.apache.hbase
    -        hbase-hadoop-compat
    -        ${hbase.version}
    -      
    -      
    -        org.apache.hbase
    -        hbase-hadoop-compat
    -        ${hbase.version}
    -        test-jar
    -        test
    -      
         
           org.apache.commons
           commons-math3
    @@ -308,31 +310,6 @@
           
             org.apache.maven.plugins
             maven-shade-plugin
    -        
    -          false
    -          ${project.build.directory}/scala-${scala.binary.version}/spark-examples-${project.version}-hadoop${hadoop.version}.jar
    -          
    -            
    -              *:*
    -            
    -          
    -          
    -            
    -              com.google.guava:guava
    -              
    -                com/google/common/base/Optional*
    -              
    -            
    -            
    -              *:*
    -              
    -                META-INF/*.SF
    -                META-INF/*.DSA
    -                META-INF/*.RSA
    -              
    -            
    -          
    -        
             
               
                 package
    @@ -340,6 +317,34 @@
                   shade
                 
                 
    +            false
    +            ${project.build.directory}/scala-${scala.binary.version}/spark-examples-${project.version}-hadoop${hadoop.version}.jar
    +            
    +              
    +                *:*
    +              
    +            
    +            
    +              
    +                com.google.guava:guava
    +                
    +                  
    +                  **
    +                
    +              
    +              
    +                *:*
    +                
    +                  META-INF/*.SF
    +                  META-INF/*.DSA
    +                  META-INF/*.RSA
    +                
    +              
    +            
                   
                     
                       com.google
    @@ -411,7 +416,7 @@
           
         
         
    -      
           scala-2.10
           
    @@ -449,5 +454,37 @@
             
           
         
    +
    +    
    +    
    +      flume-provided
    +      
    +        provided
    +      
    +    
    +    
    +      hadoop-provided
    +      
    +        provided
    +      
    +    
    +    
    +      hbase-provided
    +      
    +        provided
    +      
    +    
    +    
    +      hive-provided
    +      
    +        provided
    +      
    +    
    +    
    +      parquet-provided
    +      
    +        provided
    +      
    +    
       
     
    diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
    index 71f595d0a6800..0706f1ebf66e2 100644
    --- a/external/flume-sink/pom.xml
    +++ b/external/flume-sink/pom.xml
    @@ -38,32 +38,10 @@
         
           org.apache.flume
           flume-ng-sdk
    -      ${flume.version}
    -      
    -        
    -          io.netty
    -          netty
    -        
    -        
    -          org.apache.thrift
    -          libthrift
    -        
    -      
         
         
           org.apache.flume
           flume-ng-core
    -      ${flume.version}
    -      
    -        
    -          io.netty
    -          netty
    -        
    -        
    -          org.apache.thrift
    -          libthrift
    -        
    -      
         
         
           org.scala-lang
    diff --git a/external/flume/pom.xml b/external/flume/pom.xml
    index 0374262212e08..1f2681394c583 100644
    --- a/external/flume/pom.xml
    +++ b/external/flume/pom.xml
    @@ -46,20 +46,13 @@
           spark-streaming-flume-sink_${scala.binary.version}
           ${project.version}
         
    +    
    +      org.apache.flume
    +      flume-ng-core
    +    
         
           org.apache.flume
           flume-ng-sdk
    -      ${flume.version}
    -      
    -        
    -          io.netty
    -          netty
    -        
    -        
    -          org.apache.thrift
    -          libthrift
    -        
    -      
         
         
           org.scalacheck
    diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml
    index 2fb5f0ed2f57c..e919c2c9b19ea 100644
    --- a/external/zeromq/pom.xml
    +++ b/external/zeromq/pom.xml
    @@ -44,7 +44,6 @@
         
           ${akka.group}
           akka-zeromq_${scala.binary.version}
    -      ${akka.version}
         
         
           org.scalacheck
    diff --git a/graphx/pom.xml b/graphx/pom.xml
    index 91db799d244ad..72374aae6da9b 100644
    --- a/graphx/pom.xml
    +++ b/graphx/pom.xml
    @@ -45,10 +45,6 @@
           jblas
           ${jblas.version}
         
    -    
    -      org.eclipse.jetty
    -      jetty-server
    -    
         
           org.scalacheck
           scalacheck_${scala.binary.version}
    diff --git a/mllib/pom.xml b/mllib/pom.xml
    index 2198757481684..a0bda89ccaa71 100644
    --- a/mllib/pom.xml
    +++ b/mllib/pom.xml
    @@ -29,7 +29,7 @@
       spark-mllib_2.10
       
         mllib
    -    
    +  
       jar
       Spark Project ML Library
       http://spark.apache.org/
    @@ -50,10 +50,6 @@
           spark-sql_${scala.binary.version}
           ${project.version}
         
    -    
    -      org.eclipse.jetty
    -      jetty-server
    -    
         
           org.jblas
           jblas
    diff --git a/pom.xml b/pom.xml
    index 46ff211f91160..703e5c47bf59b 100644
    --- a/pom.xml
    +++ b/pom.xml
    @@ -123,8 +123,10 @@
         2.4.1
         ${hadoop.version}
         0.94.6
    +    hbase
         1.4.0
         3.4.5
    +    org.spark-project.hive
         
         0.13.1a
         
    @@ -143,13 +145,36 @@
         4.2.6
         3.1.1
         ${project.build.directory}/spark-test-classpath.txt
    -    64m
    -    512m
         2.10.4
         2.10
         ${scala.version}
         org.scala-lang
    -    1.8.8
    +    1.8.8
    +    1.1.1.6
    +
    +    
    +    compile
    +    compile
    +    compile
    +    compile
    +    compile
    +
    +    
    +    ${session.executionRootDirectory}
    +
    +    64m
    +    512m
    +    512m
       
     
       
    @@ -244,21 +269,20 @@
           
         
       
    -
       
    -  
    +    
         
           org.spark-project.spark
           unused
           1.0.0
         
         
         
           org.codehaus.groovy
    @@ -369,11 +393,13 @@
             org.slf4j
             slf4j-api
             ${slf4j.version}
    +        ${hadoop.deps.scope}
           
           
             org.slf4j
             slf4j-log4j12
             ${slf4j.version}
    +        ${hadoop.deps.scope}
           
           
             org.slf4j
    @@ -390,6 +416,7 @@
             log4j
             log4j
             ${log4j.version}
    +        ${hadoop.deps.scope}
           
           
             com.ning
    @@ -399,7 +426,8 @@
           
             org.xerial.snappy
             snappy-java
    -        1.1.1.6
    +        ${snappy.version}
    +        ${hadoop.deps.scope}
           
           
             net.jpountz.lz4
    @@ -427,6 +455,7 @@
             com.google.protobuf
             protobuf-java
             ${protobuf.version}
    +        ${hadoop.deps.scope}
           
           
             ${akka.group}
    @@ -448,6 +477,17 @@
             akka-testkit_${scala.binary.version}
             ${akka.version}
           
    +      
    +        ${akka.group}
    +        akka-zeromq_${scala.binary.version}
    +        ${akka.version}
    +        
    +          
    +            ${akka.group}
    +            akka-actor_${scala.binary.version}
    +          
    +        
    +      
           
             org.apache.mesos
             mesos
    @@ -577,6 +617,7 @@
             org.apache.curator
             curator-recipes
             2.4.0
    +        ${hadoop.deps.scope}
             
               
                 org.jboss.netty
    @@ -588,6 +629,7 @@
             org.apache.hadoop
             hadoop-client
             ${hadoop.version}
    +        ${hadoop.deps.scope}
             
               
                 asm
    @@ -623,11 +665,13 @@
             org.apache.avro
             avro
             ${avro.version}
    +        ${hadoop.deps.scope}
           
           
             org.apache.avro
             avro-ipc
             ${avro.version}
    +        ${hadoop.deps.scope}
             
               
                 io.netty
    @@ -656,6 +700,7 @@
             avro-mapred
             ${avro.version}
             ${avro.mapred.classifier}
    +        ${hive.deps.scope}
             
               
                 io.netty
    @@ -684,6 +729,7 @@
             net.java.dev.jets3t
             jets3t
             ${jets3t.version}
    +        ${hadoop.deps.scope}
             
               
                 commons-logging
    @@ -695,6 +741,7 @@
             org.apache.hadoop
             hadoop-yarn-api
             ${yarn.version}
    +        ${hadoop.deps.scope}
             
               
                 javax.servlet
    @@ -722,6 +769,7 @@
             org.apache.hadoop
             hadoop-yarn-common
             ${yarn.version}
    +        ${hadoop.deps.scope}
             
               
                 asm
    @@ -778,6 +826,7 @@
             org.apache.hadoop
             hadoop-yarn-server-web-proxy
             ${yarn.version}
    +        ${hadoop.deps.scope}
             
               
                 asm
    @@ -805,6 +854,7 @@
             org.apache.hadoop
             hadoop-yarn-client
             ${yarn.version}
    +        ${hadoop.deps.scope}
             
               
                 asm
    @@ -829,15 +879,126 @@
             
           
           
    -        
    -        org.codehaus.jackson
    -        jackson-mapper-asl
    -        ${jackson.version}
    +        org.apache.zookeeper
    +        zookeeper
    +        ${zookeeper.version}
    +        ${hadoop.deps.scope}
           
           
             org.codehaus.jackson
             jackson-core-asl
    -        ${jackson.version}
    +        ${codehaus.jackson.version}
    +        ${hadoop.deps.scope}
    +      
    +      
    +        org.codehaus.jackson
    +        jackson-mapper-asl
    +        ${codehaus.jackson.version}
    +        ${hadoop.deps.scope}
    +      
    +      
    +        ${hive.group}
    +        hive-beeline
    +        ${hive.version}
    +        ${hive.deps.scope}
    +      
    +      
    +        ${hive.group}
    +        hive-cli
    +        ${hive.version}
    +        ${hive.deps.scope}
    +      
    +      
    +        ${hive.group}
    +        hive-exec
    +        ${hive.version}
    +        ${hive.deps.scope}
    +        
    +          
    +            commons-logging
    +            commons-logging
    +          
    +          
    +            com.esotericsoftware.kryo
    +            kryo
    +          
    +        
    +      
    +      
    +        ${hive.group}
    +        hive-jdbc
    +        ${hive.version}
    +        ${hive.deps.scope}
    +      
    +      
    +        ${hive.group}
    +        hive-metastore
    +        ${hive.version}
    +        ${hive.deps.scope}
    +      
    +      
    +        ${hive.group}
    +        hive-serde
    +        ${hive.version}
    +        ${hive.deps.scope}
    +        
    +          
    +            commons-logging
    +            commons-logging
    +          
    +          
    +            commons-logging
    +            commons-logging-api
    +          
    +        
    +      
    +      
    +        com.twitter
    +        parquet-column
    +        ${parquet.version}
    +        ${parquet.deps.scope}
    +      
    +      
    +        com.twitter
    +        parquet-hadoop
    +        ${parquet.version}
    +        ${parquet.deps.scope}
    +      
    +      
    +        org.apache.flume
    +        flume-ng-core
    +        ${flume.version}
    +        ${flume.deps.scope}
    +        
    +          
    +            io.netty
    +            netty
    +          
    +          
    +            org.apache.thrift
    +            libthrift
    +          
    +          
    +            org.mortbay.jetty
    +            servlet-api
    +          
    +        
    +      
    +      
    +        org.apache.flume
    +        flume-ng-sdk
    +        ${flume.version}
    +        ${flume.deps.scope}
    +        
    +          
    +            io.netty
    +            netty
    +          
    +          
    +            org.apache.thrift
    +            libthrift
    +          
    +        
           
         
       
    @@ -914,6 +1075,7 @@
                   -Xmx1024m
                   -XX:PermSize=${PermGen}
                   -XX:MaxPermSize=${MaxPermGen}
    +              -XX:ReservedCodeCacheSize=${CodeCacheSize}
                 
                 
                   -source
    @@ -980,15 +1142,21 @@
                 ${project.build.directory}/surefire-reports
                 .
                 SparkTestSuite.txt
    -            -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m
    +            -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize}
                 
    +            
    +              
    +              ${test_classpath}
    +            
                 
                   true
    -              ${session.executionRootDirectory}
    +              ${spark.test.home}
                   1
                   false
                   false
    -              ${test_classpath}
                   true
                 
               
    @@ -1011,11 +1179,6 @@
               maven-antrun-plugin
               1.7
             
    -        
    -          org.apache.maven.plugins
    -          maven-shade-plugin
    -          2.2
    -        
             
               org.apache.maven.plugins
               maven-source-plugin
    @@ -1104,6 +1267,7 @@
           
             org.apache.maven.plugins
             maven-shade-plugin
    +        2.2
             
               false
               
    @@ -1373,53 +1537,6 @@
           
         
     
    -    
    -    
    -      hadoop-provided
    -      
    -        
    -          org.apache.hadoop
    -          hadoop-client
    -          provided
    -        
    -        
    -          org.apache.hadoop
    -          hadoop-yarn-api
    -          provided
    -        
    -        
    -          org.apache.hadoop
    -          hadoop-yarn-common
    -          provided
    -        
    -        
    -          org.apache.hadoop
    -          hadoop-yarn-server-web-proxy
    -          provided
    -        
    -        
    -          org.apache.hadoop
    -          hadoop-yarn-client
    -          provided
    -        
    -        
    -          org.apache.avro
    -          avro
    -          provided
    -        
    -        
    -          org.apache.avro
    -          avro-ipc
    -          provided
    -        
    -        
    -          org.apache.zookeeper
    -          zookeeper
    -          ${zookeeper.version}
    -          provided
    -        
    -      
    -    
         
           hive-thriftserver
           
    @@ -1472,5 +1589,25 @@
           
         
     
    +    
    +    
    +      flume-provided
    +    
    +    
    +      hadoop-provided
    +    
    +    
    +      hbase-provided
    +    
    +    
    +      hive-provided
    +    
    +    
    +      parquet-provided
    +    
       
     
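
     A hedged build sketch tying the new *-provided profiles to the
     ${hadoop.deps.scope}-style properties introduced above; the exact profile
     combination is illustrative.

     ```
     # Illustrative: mark the Hadoop and Hive dependency trees as "provided" so
     # they are omitted from the assembly and supplied by the cluster at runtime.
     build/mvn -Pyarn -Phadoop-provided -Phive-provided -DskipTests clean package
     ```
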
    diff --git a/repl/pom.xml b/repl/pom.xml
    index 97165e024926e..0bc8bccf90a6d 100644
    --- a/repl/pom.xml
    +++ b/repl/pom.xml
    @@ -68,10 +68,6 @@
           ${project.version}
           test
         
    -    
    -      org.eclipse.jetty
    -      jetty-server
    -    
         
           org.scala-lang
           scala-compiler
    @@ -103,13 +99,6 @@
               true
             
           
    -      
    -        org.apache.maven.plugins
    -        maven-install-plugin
    -        
    -          true
    -        
    -      
           
           
             org.codehaus.mojo
    diff --git a/sql/core/pom.xml b/sql/core/pom.xml
    index 023ce2041bb86..3e9ef07df9db6 100644
    --- a/sql/core/pom.xml
    +++ b/sql/core/pom.xml
    @@ -56,12 +56,10 @@
         
           com.twitter
           parquet-column
    -      ${parquet.version}
         
         
           com.twitter
           parquet-hadoop
    -      ${parquet.version}
         
         
           com.fasterxml.jackson.core
    diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml
    index d3a517375cf25..259eef0b80d03 100644
    --- a/sql/hive-thriftserver/pom.xml
    +++ b/sql/hive-thriftserver/pom.xml
    @@ -42,19 +42,16 @@
           ${project.version}
         
         
    -      org.spark-project.hive
    +      ${hive.group}
           hive-cli
    -      ${hive.version}
         
         
    -      org.spark-project.hive
    +      ${hive.group}
           hive-jdbc
    -      ${hive.version}
         
         
    -      org.spark-project.hive
    +      ${hive.group}
           hive-beeline
    -      ${hive.version}
         
       
       
    diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
    index e8ffbc5b954d4..60953576d0e37 100644
    --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
    +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
    @@ -48,6 +48,7 @@ class CliSuite extends FunSuite with BeforeAndAfterAll with Logging {
              |  --master local
              |  --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$jdbcUrl
              |  --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath
    +         |  --driver-class-path ${sys.props("java.class.path")}
            """.stripMargin.split("\\s+").toSeq ++ extraArgs
         }
     
    @@ -70,7 +71,7 @@ class CliSuite extends FunSuite with BeforeAndAfterAll with Logging {
         }
     
         // Searching expected output line from both stdout and stderr of the CLI process
    -    val process = (Process(command) #< queryStream).run(
    +    val process = (Process(command, None) #< queryStream).run(
           ProcessLogger(captureOutput("stdout"), captureOutput("stderr")))
     
         try {
    diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala
    index 94d5ed4f1d15e..7814aa38f4146 100644
    --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala
    +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala
    @@ -142,6 +142,7 @@ class HiveThriftServer2Suite extends FunSuite with Logging {
                  |  --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST}=localhost
                  |  --hiveconf ${ConfVars.HIVE_SERVER2_TRANSPORT_MODE}=http
                  |  --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_HTTP_PORT}=$port
    +             |  --driver-class-path ${sys.props("java.class.path")}
                """.stripMargin.split("\\s+").toSeq
           } else {
               s"""$startScript
    @@ -151,6 +152,7 @@ class HiveThriftServer2Suite extends FunSuite with Logging {
                  |  --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath
                  |  --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST}=localhost
                  |  --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_PORT}=$port
    +             |  --driver-class-path ${sys.props("java.class.path")}
                """.stripMargin.split("\\s+").toSeq
           }
     
    @@ -179,8 +181,9 @@ class HiveThriftServer2Suite extends FunSuite with Logging {
           }
         }
     
    -    // Resets SPARK_TESTING to avoid loading Log4J configurations in testing class paths
    -    val env = Seq("SPARK_TESTING" -> "0")
    +    val env = Seq(
    +      // Resets SPARK_TESTING to avoid loading Log4J configurations in testing class paths
    +      "SPARK_TESTING" -> "0")
     
         Process(command, None, env: _*).run(ProcessLogger(
           captureThriftServerOutput("stdout"),
    @@ -214,7 +217,7 @@ class HiveThriftServer2Suite extends FunSuite with Logging {
         } finally {
           warehousePath.delete()
           metastorePath.delete()
    -      Process(stopScript).run().exitValue()
    +      Process(stopScript, None, env: _*).run().exitValue()
           // The `spark-daemon.sh' script uses kill, which is not synchronous, have to wait for a while.
           Thread.sleep(3.seconds.toMillis)
           Option(logTailingProcess).map(_.destroy())
    diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml
    index 46aacad01113f..58b0722464be8 100644
    --- a/sql/hive/pom.xml
    +++ b/sql/hive/pom.xml
    @@ -47,9 +47,8 @@
           ${project.version}
         
         
    -      org.spark-project.hive
    +      ${hive.group}
           hive-metastore
    -      ${hive.version}
         
         
           commons-httpclient
    @@ -57,51 +56,27 @@
           3.1
         
         
    -      org.spark-project.hive
    +      ${hive.group}
           hive-exec
    -      ${hive.version}
    -      
    -        
    -          commons-logging
    -          commons-logging
    -        
    -        
    -          com.esotericsoftware.kryo
    -          kryo
    -        
    -      
         
         
           org.codehaus.jackson
           jackson-mapper-asl
         
         
    -      org.spark-project.hive
    +      ${hive.group}
           hive-serde
    -      ${hive.version}
    -      
    -        
    -          commons-logging
    -          commons-logging
    -        
    -        
    -          commons-logging
    -          commons-logging-api
    -        
    -      
         
         
         
           org.apache.avro
           avro
    -      ${avro.version}
         
         
         
           org.apache.avro
           avro-mapred
    -      ${avro.version}
           ${avro.mapred.classifier}
         
         
    diff --git a/streaming/pom.xml b/streaming/pom.xml
    index 2023210d9b9be..d3c6d0347a622 100644
    --- a/streaming/pom.xml
    +++ b/streaming/pom.xml
    @@ -68,13 +68,13 @@
         target/scala-${scala.binary.version}/classes
         target/scala-${scala.binary.version}/test-classes
         
    -      
           
    diff --git a/yarn/pom.xml b/yarn/pom.xml
    index bcb77b3e3c70e..b86857db7bde6 100644
    --- a/yarn/pom.xml
    +++ b/yarn/pom.xml
    @@ -131,13 +131,6 @@
               true
             
           
    -      
    -        org.apache.maven.plugins
    -        maven-install-plugin
    -        
    -          true
    -        
    -      
         
     
         target/scala-${scala.binary.version}/classes
    diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
    index 8d0543771309b..c363d755c1752 100644
    --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
    +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
    @@ -367,6 +367,10 @@ private[spark] class Client(
           }
         }
     
    +    sys.env.get(ENV_DIST_CLASSPATH).foreach { dcp =>
    +      env(ENV_DIST_CLASSPATH) = dcp
    +    }
    +
         env
       }
     
    @@ -652,6 +656,9 @@ object Client extends Logging {
       val APP_FILE_PERMISSION: FsPermission =
         FsPermission.createImmutable(Integer.parseInt("644", 8).toShort)
     
    +  // Distribution-defined classpath to add to processes
    +  val ENV_DIST_CLASSPATH = "SPARK_DIST_CLASSPATH"
    +
       /**
        * Find the user-defined Spark jar if configured, or return the jar containing this
        * class if not.
    
    From 167a5ab0bd1d37f3ac23bec49e484a238610cf75 Mon Sep 17 00:00:00 2001
    From: Nicholas Chammas 
    Date: Thu, 8 Jan 2015 17:42:08 -0800
    Subject: [PATCH 088/116] [SPARK-5122] Remove Shark from spark-ec2
    
    I moved the Spark-Shark version map [to the wiki](https://cwiki.apache.org/confluence/display/SPARK/Spark-Shark+version+mapping).
    
    This PR has a [matching PR in mesos/spark-ec2](https://github.com/mesos/spark-ec2/pull/89).
    
    Author: Nicholas Chammas 
    
    Closes #3939 from nchammas/remove-shark and squashes the following commits:
    
    66e0841 [Nicholas Chammas] fix style
    ceeab85 [Nicholas Chammas] show default Spark GitHub repo
    7270126 [Nicholas Chammas] validate Spark hashes
    db4935d [Nicholas Chammas] validate spark version upfront
    fc0d5b9 [Nicholas Chammas] remove Shark
    ---
     ec2/spark_ec2.py | 78 +++++++++++++++++++++++++++---------------------
     1 file changed, 44 insertions(+), 34 deletions(-)
    
    diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
    index 485eea4f5e683..abab209a05ba0 100755
    --- a/ec2/spark_ec2.py
    +++ b/ec2/spark_ec2.py
    @@ -39,10 +39,26 @@
     from optparse import OptionParser
     from sys import stderr
     
    +VALID_SPARK_VERSIONS = set([
    +    "0.7.3",
    +    "0.8.0",
    +    "0.8.1",
    +    "0.9.0",
    +    "0.9.1",
    +    "0.9.2",
    +    "1.0.0",
    +    "1.0.1",
    +    "1.0.2",
    +    "1.1.0",
    +    "1.1.1",
    +    "1.2.0",
    +])
    +
     DEFAULT_SPARK_VERSION = "1.2.0"
    +DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark"
     SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))
    -
     MESOS_SPARK_EC2_BRANCH = "branch-1.3"
    +
     # A URL prefix from which to fetch AMI information
     AMI_PREFIX = "https://raw.github.com/mesos/spark-ec2/{b}/ami-list".format(b=MESOS_SPARK_EC2_BRANCH)
     
    @@ -126,8 +142,8 @@ def parse_args():
             help="Version of Spark to use: 'X.Y.Z' or a specific git hash (default: %default)")
         parser.add_option(
             "--spark-git-repo",
    -        default="https://github.com/apache/spark",
    -        help="Github repo from which to checkout supplied commit hash")
    +        default=DEFAULT_SPARK_GITHUB_REPO,
    +        help="Github repo from which to checkout supplied commit hash (default: %default)")
         parser.add_option(
             "--hadoop-major-version", default="1",
             help="Major version of Hadoop (default: %default)")
    @@ -236,6 +252,26 @@ def get_or_make_group(conn, name, vpc_id):
             return conn.create_security_group(name, "Spark EC2 group", vpc_id)
     
     
    +def get_validate_spark_version(version, repo):
    +    if "." in version:
    +        version = version.replace("v", "")
    +        if version not in VALID_SPARK_VERSIONS:
    +            print >> stderr, "Don't know about Spark version: {v}".format(v=version)
    +            sys.exit(1)
    +        return version
    +    else:
    +        github_commit_url = "{repo}/commit/{commit_hash}".format(repo=repo, commit_hash=version)
    +        request = urllib2.Request(github_commit_url)
    +        request.get_method = lambda: 'HEAD'
    +        try:
    +            response = urllib2.urlopen(request)
    +        except urllib2.HTTPError, e:
    +            print >> stderr, "Couldn't validate Spark commit: {url}".format(url=github_commit_url)
    +            print >> stderr, "Received HTTP response code of {code}.".format(code=e.code)
    +            sys.exit(1)
    +        return version
    +
    +
     # Check whether a given EC2 instance object is in a state we consider active,
     # i.e. not terminating or terminated. We count both stopping and stopped as
     # active since we can restart stopped clusters.
    @@ -243,29 +279,6 @@ def is_active(instance):
         return (instance.state in ['pending', 'running', 'stopping', 'stopped'])
     
     
    -# Return correct versions of Spark and Shark, given the supplied Spark version
    -def get_spark_shark_version(opts):
    -    spark_shark_map = {
    -        "0.7.3": "0.7.1",
    -        "0.8.0": "0.8.0",
    -        "0.8.1": "0.8.1",
    -        "0.9.0": "0.9.0",
    -        "0.9.1": "0.9.1",
    -        # These are dummy versions (no Shark versions after this)
    -        "1.0.0": "1.0.0",
    -        "1.0.1": "1.0.1",
    -        "1.0.2": "1.0.2",
    -        "1.1.0": "1.1.0",
    -        "1.1.1": "1.1.1",
    -        "1.2.0": "1.2.0",
    -    }
    -    version = opts.spark_version.replace("v", "")
    -    if version not in spark_shark_map:
    -        print >> stderr, "Don't know about Spark version: %s" % version
    -        sys.exit(1)
    -    return (version, spark_shark_map[version])
    -
    -
     # Attempt to resolve an appropriate AMI given the architecture and region of the request.
     # Source: http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/
     # Last Updated: 2014-06-20
    @@ -619,7 +632,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
                 print slave.public_dns_name
                 ssh_write(slave.public_dns_name, opts, ['tar', 'x'], dot_ssh_tar)
     
    -    modules = ['spark', 'shark', 'ephemeral-hdfs', 'persistent-hdfs',
    +    modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs',
                    'mapreduce', 'spark-standalone', 'tachyon']
     
         if opts.hadoop_major_version == "1":
    @@ -706,9 +719,7 @@ def wait_for_cluster_state(conn, opts, cluster_instances, cluster_state):
         sys.stdout.flush()
     
         start_time = datetime.now()
    -
         num_attempts = 0
    -    conn = ec2.connect_to_region(opts.region)
     
         while True:
             time.sleep(5 * num_attempts)  # seconds
    @@ -815,13 +826,11 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
         cluster_url = "%s:7077" % active_master
     
         if "." in opts.spark_version:
    -        # Pre-built spark & shark deploy
    -        (spark_v, shark_v) = get_spark_shark_version(opts)
    +        # Pre-built Spark deploy
    +        spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
         else:
             # Spark-only custom deploy
             spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version)
    -        shark_v = ""
    -        modules = filter(lambda x: x != "shark", modules)
     
         template_vars = {
             "master_list": '\n'.join([i.public_dns_name for i in master_nodes]),
    @@ -834,7 +843,6 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
             "swap": str(opts.swap),
             "modules": '\n'.join(modules),
             "spark_version": spark_v,
    -        "shark_version": shark_v,
             "hadoop_major_version": opts.hadoop_major_version,
             "spark_worker_instances": "%d" % opts.worker_instances,
             "spark_master_opts": opts.master_opts
    @@ -983,6 +991,8 @@ def real_main():
         (opts, action, cluster_name) = parse_args()
     
         # Input parameter validation
    +    get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
    +
         if opts.wait is not None:
             # NOTE: DeprecationWarnings are silent in 2.7+ by default.
             #       To show them, run Python with the -Wdefault switch.
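
     An illustrative launch (not part of the patch) that exercises the new upfront
     validation; the key pair, identity file, and cluster name are placeholders:

     ```
     # The version string is checked against VALID_SPARK_VERSIONS (or, for a git
     # hash, against --spark-git-repo) before any EC2 resources are created.
     ./ec2/spark-ec2 -k my-keypair -i ~/.ssh/my-keypair.pem \
       --spark-version=1.2.0 launch my-spark-cluster
     ```
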
    
    From f3da4bd7289d493014ad3c5176ada60794dfcfe0 Mon Sep 17 00:00:00 2001
    From: WangTaoTheTonic 
    Date: Fri, 9 Jan 2015 08:10:09 -0600
    Subject: [PATCH 089/116] [SPARK-5169][YARN]fetch the correct max attempts
    
     Sorry for fetching the wrong max attempts in this commit https://github.com/apache/spark/commit/8fdd48959c93b9cf809f03549e2ae6c4687d1fcd.
    We need to fix it now.
    
    tgravescs
    
     If we set a spark.yarn.maxAppAttempts which is larger than `yarn.resourcemanager.am.max-attempts` on the YARN side, it will be overridden, as described here:
    >The maximum number of application attempts. It's a global setting for all application masters. Each application master can specify its individual maximum number of application attempts via the API, but the individual number cannot be more than the global upper bound. If it is, the resourcemanager will override it. The default number is set to 2, to allow at least one retry for AM.
    
    http://hadoop.apache.org/docs/r2.6.0/hadoop-yarn/hadoop-yarn-common/yarn-default.xml
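
     As a hedged illustration of the interaction described above (the application
     class, jar, and values are made up):

     ```
     # With yarn.resourcemanager.am.max-attempts=2 on the cluster, asking for 4
     # attempts is capped to 2 by YarnRMClient.getMaxRegAttempts in this patch.
     ./bin/spark-submit --master yarn-cluster \
       --conf spark.yarn.maxAppAttempts=4 \
       --class org.example.MyApp myapp.jar
     ```
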
    
    Author: WangTaoTheTonic 
    
    Closes #3942 from WangTaoTheTonic/HOTFIX and squashes the following commits:
    
    9ac16ce [WangTaoTheTonic] fetch the correct max attempts
    ---
     .../org/apache/spark/deploy/yarn/YarnRMClient.scala  | 12 +++++++++---
     1 file changed, 9 insertions(+), 3 deletions(-)
    
    diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
    index e183efccbb6f7..b45e599588ad3 100644
    --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
    +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
    @@ -121,9 +121,15 @@ private[spark] class YarnRMClient(args: ApplicationMasterArguments) extends Logg
     
       /** Returns the maximum number of attempts to register the AM. */
       def getMaxRegAttempts(sparkConf: SparkConf, yarnConf: YarnConfiguration): Int = {
    -    sparkConf.getOption("spark.yarn.maxAppAttempts").map(_.toInt).getOrElse(
    -      yarnConf.getInt(
    -        YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS))
    +    val sparkMaxAttempts = sparkConf.getOption("spark.yarn.maxAppAttempts").map(_.toInt)
    +    val yarnMaxAttempts = yarnConf.getInt(
    +      YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)
    +    val retval: Int = sparkMaxAttempts match {
    +      case Some(x) => if (x <= yarnMaxAttempts) x else yarnMaxAttempts
    +      case None => yarnMaxAttempts
    +    }
    +
    +    retval
       }
     
     }
    
    From b4034c3f889bf24f60eb806802866b48e4cbe55c Mon Sep 17 00:00:00 2001
    From: Aaron Davidson 
    Date: Fri, 9 Jan 2015 09:20:16 -0800
    Subject: [PATCH 090/116] [Minor] Fix test RetryingBlockFetcherSuite after
     changed config name
    
     Flaky due to the default retry interval being the same as our test's wait timeout.
    
    Author: Aaron Davidson 
    
    Closes #3972 from aarondav/fix-test and squashes the following commits:
    
    db77cab [Aaron Davidson] [Minor] Fix test after changed config name
    ---
     .../spark/network/shuffle/RetryingBlockFetcherSuite.java      | 4 ++--
     1 file changed, 2 insertions(+), 2 deletions(-)
    
    diff --git a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java
    index 0191fe529e1be..1ad0d72ae5ec5 100644
    --- a/network/shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java
    +++ b/network/shuffle/src/test/java/org/apache/spark/network/shuffle/RetryingBlockFetcherSuite.java
    @@ -54,13 +54,13 @@ public class RetryingBlockFetcherSuite {
       @Before
       public void beforeEach() {
         System.setProperty("spark.shuffle.io.maxRetries", "2");
    -    System.setProperty("spark.shuffle.io.retryWaitMs", "0");
    +    System.setProperty("spark.shuffle.io.retryWait", "0");
       }
     
       @After
       public void afterEach() {
         System.clearProperty("spark.shuffle.io.maxRetries");
    -    System.clearProperty("spark.shuffle.io.retryWaitMs");
    +    System.clearProperty("spark.shuffle.io.retryWait");
       }
     
       @Test
    
    From 547df97715580f99ae573a49a86da12bf20cbc3d Mon Sep 17 00:00:00 2001
    From: Sean Owen 
    Date: Fri, 9 Jan 2015 09:35:46 -0800
    Subject: [PATCH 091/116] SPARK-5136 [DOCS] Improve documentation around
     setting up Spark IntelliJ project
    
     This PR simply points to the IntelliJ wiki page instead of also including IntelliJ notes in the docs. The intent, however, is to also update the wiki page with updated tips. This is the text I propose for the IntelliJ section on the wiki. I realize it omits some of the existing instructions on the wiki about enabling Hive, but I think those are actually optional.
    
    ------
    
    IntelliJ supports both Maven- and SBT-based projects. It is recommended, however, to import Spark as a Maven project. Choose "Import Project..." from the File menu, and select the `pom.xml` file in the Spark root directory.
    
     It is fine to leave all settings at their default values in the Maven import wizard, with two caveats. First, it is usually useful to enable "Import Maven projects automatically", since changes to the project structure will then automatically update the IntelliJ project.
    
     Second, note the step that prompts you to choose active Maven build profiles. As documented above, some build configurations require specific profiles to be enabled. The same profiles that are enabled with `-P[profile name]` above may be enabled on this screen. For example, if developing for Hadoop 2.4 with YARN support, enable profiles `yarn` and `hadoop-2.4`.
    
     These selections can be changed later by accessing the "Maven Projects" tool window from the View menu, and expanding the Profiles section.

     "Rebuild Project" can fail the first time the project is compiled, because generated source files are not created automatically. Try clicking the "Generate Sources and Update Folders For All Projects" button in the "Maven Projects" tool window to manually generate these sources.
    
     Compilation may fail with an error like "scalac: bad option: -P:/home/jakub/.m2/repository/org/scalamacros/paradise_2.10.4/2.0.1/paradise_2.10.4-2.0.1.jar". If so, go to Preferences > Build, Execution, Deployment > Scala Compiler and clear the "Additional compiler options" field. It will then work, although the option will return when the project is re-imported.
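
     For reference, a hedged command-line equivalent of the profile selection
     described above (the hadoop.version value is illustrative):

     ```
     # Selecting the same profiles on the command line instead of in the IntelliJ
     # import wizard.
     build/mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests package
     ```
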
    
    Author: Sean Owen 
    
    Closes #3952 from srowen/SPARK-5136 and squashes the following commits:
    
    f3baa66 [Sean Owen] Point to new IJ / Eclipse wiki link
    016b7df [Sean Owen] Point to IntelliJ wiki page instead of also including IntelliJ notes in the docs
    ---
     docs/building-spark.md | 5 +++--
     1 file changed, 3 insertions(+), 2 deletions(-)
    
    diff --git a/docs/building-spark.md b/docs/building-spark.md
    index c1bcd91b5b853..fb93017861ed0 100644
    --- a/docs/building-spark.md
    +++ b/docs/building-spark.md
    @@ -151,9 +151,10 @@ Thus, the full flow for running continuous-compilation of the `core` submodule m
      $ mvn scala:cc
     ```
     
    -# Using With IntelliJ IDEA
    +# Building Spark with IntelliJ IDEA or Eclipse
     
    -This setup works fine in IntelliJ IDEA 11.1.4. After opening the project via the pom.xml file in the project root folder, you only need to activate either the hadoop1 or hadoop2 profile in the "Maven Properties" popout. We have not tried Eclipse/Scala IDE with this.
    +For help in setting up IntelliJ IDEA or Eclipse for Spark development, and troubleshooting, refer to the
    +[wiki page for IDE setup](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-IDESetup).
     
     # Building Spark Debian Packages
     
    
    From 1790b38695b46400a24b0b7e278e8e8388748211 Mon Sep 17 00:00:00 2001
    From: Patrick Wendell 
    Date: Fri, 9 Jan 2015 09:40:18 -0800
    Subject: [PATCH 092/116] HOTFIX: Minor improvements to make-distribution.sh
    
    1. Renames $FWDIR to $SPARK_HOME (vast majority of diff).
     2. Uses the Spark-provided Maven (build/mvn).
    3. Logs build flags in the RELEASE file.
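
     A hedged example invocation using the new --mvn flag (the name and profile
     flags are illustrative):

     ```
     # Build a binary distribution with the bundled Maven wrapper; options after
     # the script flags are passed through to Maven and logged in the RELEASE file.
     ./make-distribution.sh --name custom-2.4 --tgz --mvn build/mvn \
       -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0
     ```
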
    
    Author: Patrick Wendell 
    
    Closes #3973 from pwendell/master and squashes the following commits:
    
    340a2fa [Patrick Wendell] HOTFIX: Minor improvements to make-distribution.sh
    ---
     make-distribution.sh | 61 ++++++++++++++++++++++++--------------------
     1 file changed, 34 insertions(+), 27 deletions(-)
    
    diff --git a/make-distribution.sh b/make-distribution.sh
    index 45c99e42e5a5b..4e2f400be3053 100755
    --- a/make-distribution.sh
    +++ b/make-distribution.sh
    @@ -28,18 +28,20 @@ set -o pipefail
     set -e
     
     # Figure out where the Spark framework is installed
    -FWDIR="$(cd "`dirname "$0"`"; pwd)"
    -DISTDIR="$FWDIR/dist"
    +SPARK_HOME="$(cd "`dirname "$0"`"; pwd)"
    +DISTDIR="$SPARK_HOME/dist"
     
     SPARK_TACHYON=false
     MAKE_TGZ=false
     NAME=none
    +MVN="$SPARK_HOME/build/mvn"
     
     function exit_with_usage {
       echo "make-distribution.sh - tool for making binary distributions of Spark"
       echo ""
       echo "usage:"
    -  echo "./make-distribution.sh [--name] [--tgz] [--with-tachyon] "
    +  cl_options="[--name] [--tgz] [--mvn ] [--with-tachyon]"
    +  echo "./make-distribution.sh $cl_options "
       echo "See Spark's \"Building Spark\" doc for correct Maven options."
       echo ""
       exit 1
    @@ -71,6 +73,10 @@ while (( "$#" )); do
         --tgz)
           MAKE_TGZ=true
           ;;
    +    --mvn)
    +      MVN="$2"
    +      shift
    +      ;;
         --name)
           NAME="$2"
           shift
    @@ -109,9 +115,9 @@ if which git &>/dev/null; then
         unset GITREV
     fi
     
    -if ! which mvn &>/dev/null; then
    -    echo -e "You need Maven installed to build Spark."
    -    echo -e "Download Maven from https://maven.apache.org/"
    +if ! which $MVN &>/dev/null; then
    +    echo -e "Could not locate Maven command: '$MVN'."
    +    echo -e "Specify the Maven command with the --mvn flag"
         exit -1;
     fi
     
    @@ -119,7 +125,7 @@ VERSION=$(mvn help:evaluate -Dexpression=project.version 2>/dev/null | grep -v "
     SPARK_HADOOP_VERSION=$(mvn help:evaluate -Dexpression=hadoop.version $@ 2>/dev/null\
         | grep -v "INFO"\
         | tail -n 1)
    -SPARK_HIVE=$(mvn help:evaluate -Dexpression=project.activeProfiles -pl sql/hive $@ 2>/dev/null\
    +SPARK_HIVE=$($MVN help:evaluate -Dexpression=project.activeProfiles -pl sql/hive $@ 2>/dev/null\
         | grep -v "INFO"\
         | fgrep --count "hive";\
         # Reset exit status to 0, otherwise the script stops here if the last grep finds nothing\
    @@ -161,11 +167,11 @@ else
     fi
     
     # Build uber fat JAR
    -cd "$FWDIR"
    +cd "$SPARK_HOME"
     
     export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"
     
    -BUILD_COMMAND="mvn clean package -DskipTests $@"
    +BUILD_COMMAND="$MVN clean package -DskipTests $@"
     
     # Actually build the jar
     echo -e "\nBuilding with..."
    @@ -177,41 +183,42 @@ ${BUILD_COMMAND}
     rm -rf "$DISTDIR"
     mkdir -p "$DISTDIR/lib"
     echo "Spark $VERSION$GITREVSTRING built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE"
    +echo "Build flags: $@" >> "$DISTDIR/RELEASE"
     
     # Copy jars
    -cp "$FWDIR"/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
    -cp "$FWDIR"/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/"
    +cp "$SPARK_HOME"/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
    +cp "$SPARK_HOME"/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/"
     # This will fail if the -Pyarn profile is not provided
     # In this case, silence the error and ignore the return code of this command
    -cp "$FWDIR"/network/yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/lib/" &> /dev/null || :
    +cp "$SPARK_HOME"/network/yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/lib/" &> /dev/null || :
     
     # Copy example sources (needed for python and SQL)
     mkdir -p "$DISTDIR/examples/src/main"
    -cp -r "$FWDIR"/examples/src/main "$DISTDIR/examples/src/"
    +cp -r "$SPARK_HOME"/examples/src/main "$DISTDIR/examples/src/"
     
     if [ "$SPARK_HIVE" == "1" ]; then
    -  cp "$FWDIR"/lib_managed/jars/datanucleus*.jar "$DISTDIR/lib/"
    +  cp "$SPARK_HOME"/lib_managed/jars/datanucleus*.jar "$DISTDIR/lib/"
     fi
     
     # Copy license and ASF files
    -cp "$FWDIR/LICENSE" "$DISTDIR"
    -cp "$FWDIR/NOTICE" "$DISTDIR"
    +cp "$SPARK_HOME/LICENSE" "$DISTDIR"
    +cp "$SPARK_HOME/NOTICE" "$DISTDIR"
     
    -if [ -e "$FWDIR"/CHANGES.txt ]; then
    -  cp "$FWDIR/CHANGES.txt" "$DISTDIR"
    +if [ -e "$SPARK_HOME"/CHANGES.txt ]; then
    +  cp "$SPARK_HOME/CHANGES.txt" "$DISTDIR"
     fi
     
     # Copy data files
    -cp -r "$FWDIR/data" "$DISTDIR"
    +cp -r "$SPARK_HOME/data" "$DISTDIR"
     
     # Copy other things
     mkdir "$DISTDIR"/conf
    -cp "$FWDIR"/conf/*.template "$DISTDIR"/conf
    -cp "$FWDIR/README.md" "$DISTDIR"
    -cp -r "$FWDIR/bin" "$DISTDIR"
    -cp -r "$FWDIR/python" "$DISTDIR"
    -cp -r "$FWDIR/sbin" "$DISTDIR"
    -cp -r "$FWDIR/ec2" "$DISTDIR"
    +cp "$SPARK_HOME"/conf/*.template "$DISTDIR"/conf
    +cp "$SPARK_HOME/README.md" "$DISTDIR"
    +cp -r "$SPARK_HOME/bin" "$DISTDIR"
    +cp -r "$SPARK_HOME/python" "$DISTDIR"
    +cp -r "$SPARK_HOME/sbin" "$DISTDIR"
    +cp -r "$SPARK_HOME/ec2" "$DISTDIR"
     
     # Download and copy in tachyon, if requested
     if [ "$SPARK_TACHYON" == "true" ]; then
    @@ -243,9 +250,9 @@ fi
     
     if [ "$MAKE_TGZ" == "true" ]; then
       TARDIR_NAME=spark-$VERSION-bin-$NAME
    -  TARDIR="$FWDIR/$TARDIR_NAME"
    +  TARDIR="$SPARK_HOME/$TARDIR_NAME"
       rm -rf "$TARDIR"
       cp -r "$DISTDIR" "$TARDIR"
    -  tar czf "spark-$VERSION-bin-$NAME.tgz" -C "$FWDIR" "$TARDIR_NAME"
    +  tar czf "spark-$VERSION-bin-$NAME.tgz" -C "$SPARK_HOME" "$TARDIR_NAME"
       rm -rf "$TARDIR"
     fi
    
    From b6aa557300275b835cce7baa7bc8a80eb5425cbb Mon Sep 17 00:00:00 2001
    From: Kay Ousterhout 
    Date: Fri, 9 Jan 2015 09:47:06 -0800
    Subject: [PATCH 093/116] [SPARK-1143] Separate pool tests into their own
     suite.
    
    The current TaskSchedulerImplSuite includes some tests that are
    actually for the TaskSchedulerImpl, but the remainder of the tests avoid using
     the TaskSchedulerImpl entirely, and instead test the pool and scheduling
    algorithm mechanisms. This commit separates the pool/scheduling algorithm
    tests into their own suite, and also simplifies those tests.
    
    The pull request replaces #339.
    
    Author: Kay Ousterhout 
    
    Closes #3967 from kayousterhout/SPARK-1143 and squashes the following commits:
    
    8a898c4 [Kay Ousterhout] [SPARK-1143] Separate pool tests into their own suite.
    ---
     .../apache/spark/scheduler/PoolSuite.scala    | 183 ++++++++++++++
     .../scheduler/TaskSchedulerImplSuite.scala    | 230 ------------------
     2 files changed, 183 insertions(+), 230 deletions(-)
     create mode 100644 core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala
    
    diff --git a/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala
    new file mode 100644
    index 0000000000000..e8f461e2f56c9
    --- /dev/null
    +++ b/core/src/test/scala/org/apache/spark/scheduler/PoolSuite.scala
    @@ -0,0 +1,183 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.scheduler
    +
    +import java.util.Properties
    +
    +import org.scalatest.FunSuite
    +
    +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext}
    +
    +/**
    + * Tests that pools and the associated scheduling algorithms for FIFO and fair scheduling work
    + * correctly.
    + */
    +class PoolSuite extends FunSuite with LocalSparkContext {
    +
    +  def createTaskSetManager(stageId: Int, numTasks: Int, taskScheduler: TaskSchedulerImpl)
    +    : TaskSetManager = {
    +    val tasks = Array.tabulate[Task[_]](numTasks) { i =>
    +      new FakeTask(i, Nil)
    +    }
    +    new TaskSetManager(taskScheduler, new TaskSet(tasks, stageId, 0, 0, null), 0)
    +  }
    +
    +  def scheduleTaskAndVerifyId(taskId: Int, rootPool: Pool, expectedStageId: Int) {
    +    val taskSetQueue = rootPool.getSortedTaskSetQueue
    +    val nextTaskSetToSchedule =
    +      taskSetQueue.find(t => (t.runningTasks + t.tasksSuccessful) < t.numTasks)
    +    assert(nextTaskSetToSchedule.isDefined)
    +    nextTaskSetToSchedule.get.addRunningTask(taskId)
    +    assert(nextTaskSetToSchedule.get.stageId === expectedStageId)
    +  }
    +
    +  test("FIFO Scheduler Test") {
    +    sc = new SparkContext("local", "TaskSchedulerImplSuite")
    +    val taskScheduler = new TaskSchedulerImpl(sc)
    +
    +    val rootPool = new Pool("", SchedulingMode.FIFO, 0, 0)
    +    val schedulableBuilder = new FIFOSchedulableBuilder(rootPool)
    +    schedulableBuilder.buildPools()
    +
    +    val taskSetManager0 = createTaskSetManager(0, 2, taskScheduler)
    +    val taskSetManager1 = createTaskSetManager(1, 2, taskScheduler)
    +    val taskSetManager2 = createTaskSetManager(2, 2, taskScheduler)
    +    schedulableBuilder.addTaskSetManager(taskSetManager0, null)
    +    schedulableBuilder.addTaskSetManager(taskSetManager1, null)
    +    schedulableBuilder.addTaskSetManager(taskSetManager2, null)
    +
    +    scheduleTaskAndVerifyId(0, rootPool, 0)
    +    scheduleTaskAndVerifyId(1, rootPool, 0)
    +    scheduleTaskAndVerifyId(2, rootPool, 1)
    +    scheduleTaskAndVerifyId(3, rootPool, 1)
    +    scheduleTaskAndVerifyId(4, rootPool, 2)
    +    scheduleTaskAndVerifyId(5, rootPool, 2)
    +  }
    +
    +  /**
    +   * This test creates three scheduling pools, and creates task set managers in the first
    +   * two scheduling pools. The test verifies that as tasks are scheduled, the fair scheduling
    +   * algorithm properly orders the two scheduling pools.
    +   */
    +  test("Fair Scheduler Test") {
    +    val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile()
    +    val conf = new SparkConf().set("spark.scheduler.allocation.file", xmlPath)
    +    sc = new SparkContext("local", "TaskSchedulerImplSuite", conf)
    +    val taskScheduler = new TaskSchedulerImpl(sc)
    +
    +    val rootPool = new Pool("", SchedulingMode.FAIR, 0, 0)
    +    val schedulableBuilder = new FairSchedulableBuilder(rootPool, sc.conf)
    +    schedulableBuilder.buildPools()
    +
    +    // Ensure that the XML file was read in correctly.
    +    assert(rootPool.getSchedulableByName("default") != null)
    +    assert(rootPool.getSchedulableByName("1") != null)
    +    assert(rootPool.getSchedulableByName("2") != null)
    +    assert(rootPool.getSchedulableByName("3") != null)
    +    assert(rootPool.getSchedulableByName("1").minShare === 2)
    +    assert(rootPool.getSchedulableByName("1").weight === 1)
    +    assert(rootPool.getSchedulableByName("2").minShare === 3)
    +    assert(rootPool.getSchedulableByName("2").weight === 1)
    +    assert(rootPool.getSchedulableByName("3").minShare === 0)
    +    assert(rootPool.getSchedulableByName("3").weight === 1)
    +
    +    val properties1 = new Properties()
    +    properties1.setProperty("spark.scheduler.pool","1")
    +    val properties2 = new Properties()
    +    properties2.setProperty("spark.scheduler.pool","2")
    +
    +    val taskSetManager10 = createTaskSetManager(0, 1, taskScheduler)
    +    val taskSetManager11 = createTaskSetManager(1, 1, taskScheduler)
    +    val taskSetManager12 = createTaskSetManager(2, 2, taskScheduler)
    +    schedulableBuilder.addTaskSetManager(taskSetManager10, properties1)
    +    schedulableBuilder.addTaskSetManager(taskSetManager11, properties1)
    +    schedulableBuilder.addTaskSetManager(taskSetManager12, properties1)
    +
    +    val taskSetManager23 = createTaskSetManager(3, 2, taskScheduler)
    +    val taskSetManager24 = createTaskSetManager(4, 2, taskScheduler)
    +    schedulableBuilder.addTaskSetManager(taskSetManager23, properties2)
    +    schedulableBuilder.addTaskSetManager(taskSetManager24, properties2)
    +
    +    // Pool 1 share ratio: 0. Pool 2 share ratio: 0. 1 gets scheduled based on ordering of names.
    +    scheduleTaskAndVerifyId(0, rootPool, 0)
    +    // Pool 1 share ratio: 1/2. Pool 2 share ratio: 0. 2 gets scheduled because ratio is lower.
    +    scheduleTaskAndVerifyId(1, rootPool, 3)
    +    // Pool 1 share ratio: 1/2. Pool 2 share ratio: 1/3. 2 gets scheduled because ratio is lower.
    +    scheduleTaskAndVerifyId(2, rootPool, 3)
    +    // Pool 1 share ratio: 1/2. Pool 2 share ratio: 2/3. 1 gets scheduled because ratio is lower.
    +    scheduleTaskAndVerifyId(3, rootPool, 1)
    +    // Pool 1 share ratio: 1. Pool 2 share ratio: 2/3. 2 gets scheduled because ratio is lower.
    +    scheduleTaskAndVerifyId(4, rootPool, 4)
    +    // Neither pool is needy so ordering is based on number of running tasks.
    +    // Pool 1 running tasks: 2, Pool 2 running tasks: 3. 1 gets scheduled because fewer running
    +    // tasks.
    +    scheduleTaskAndVerifyId(5, rootPool, 2)
    +    // Pool 1 running tasks: 3, Pool 2 running tasks: 3. 1 gets scheduled because of naming
    +    // ordering.
    +    scheduleTaskAndVerifyId(6, rootPool, 2)
    +    // Pool 1 running tasks: 4, Pool 2 running tasks: 3. 2 gets scheduled because fewer running
    +    // tasks.
    +    scheduleTaskAndVerifyId(7, rootPool, 4)
    +  }
    +
    +  test("Nested Pool Test") {
    +    sc = new SparkContext("local", "TaskSchedulerImplSuite")
    +    val taskScheduler = new TaskSchedulerImpl(sc)
    +
    +    val rootPool = new Pool("", SchedulingMode.FAIR, 0, 0)
    +    val pool0 = new Pool("0", SchedulingMode.FAIR, 3, 1)
    +    val pool1 = new Pool("1", SchedulingMode.FAIR, 4, 1)
    +    rootPool.addSchedulable(pool0)
    +    rootPool.addSchedulable(pool1)
    +
    +    val pool00 = new Pool("00", SchedulingMode.FAIR, 2, 2)
    +    val pool01 = new Pool("01", SchedulingMode.FAIR, 1, 1)
    +    pool0.addSchedulable(pool00)
    +    pool0.addSchedulable(pool01)
    +
    +    val pool10 = new Pool("10", SchedulingMode.FAIR, 2, 2)
    +    val pool11 = new Pool("11", SchedulingMode.FAIR, 2, 1)
    +    pool1.addSchedulable(pool10)
    +    pool1.addSchedulable(pool11)
    +
    +    val taskSetManager000 = createTaskSetManager(0, 5, taskScheduler)
    +    val taskSetManager001 = createTaskSetManager(1, 5, taskScheduler)
    +    pool00.addSchedulable(taskSetManager000)
    +    pool00.addSchedulable(taskSetManager001)
    +
    +    val taskSetManager010 = createTaskSetManager(2, 5, taskScheduler)
    +    val taskSetManager011 = createTaskSetManager(3, 5, taskScheduler)
    +    pool01.addSchedulable(taskSetManager010)
    +    pool01.addSchedulable(taskSetManager011)
    +
    +    val taskSetManager100 = createTaskSetManager(4, 5, taskScheduler)
    +    val taskSetManager101 = createTaskSetManager(5, 5, taskScheduler)
    +    pool10.addSchedulable(taskSetManager100)
    +    pool10.addSchedulable(taskSetManager101)
    +
    +    val taskSetManager110 = createTaskSetManager(6, 5, taskScheduler)
    +    val taskSetManager111 = createTaskSetManager(7, 5, taskScheduler)
    +    pool11.addSchedulable(taskSetManager110)
    +    pool11.addSchedulable(taskSetManager111)
    +
    +    scheduleTaskAndVerifyId(0, rootPool, 0)
    +    scheduleTaskAndVerifyId(1, rootPool, 4)
    +    scheduleTaskAndVerifyId(2, rootPool, 6)
    +    scheduleTaskAndVerifyId(3, rootPool, 2)
    +  }
    +}
    diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
    index 00812e6018d1f..8874cf00e9993 100644
    --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
    +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
    @@ -30,238 +30,8 @@ class FakeSchedulerBackend extends SchedulerBackend {
       def defaultParallelism() = 1
     }
     
    -class FakeTaskSetManager(
    -    initPriority: Int,
    -    initStageId: Int,
    -    initNumTasks: Int,
    -    taskScheduler: TaskSchedulerImpl,
    -    taskSet: TaskSet)
    -  extends TaskSetManager(taskScheduler, taskSet, 0) {
    -
    -  parent = null
    -  weight = 1
    -  minShare = 2
    -  priority = initPriority
    -  stageId = initStageId
    -  name = "TaskSet_"+stageId
    -  override val numTasks = initNumTasks
    -  tasksSuccessful = 0
    -
    -  var numRunningTasks = 0
    -  override def runningTasks = numRunningTasks
    -
    -  def increaseRunningTasks(taskNum: Int) {
    -    numRunningTasks += taskNum
    -    if (parent != null) {
    -      parent.increaseRunningTasks(taskNum)
    -    }
    -  }
    -
    -  def decreaseRunningTasks(taskNum: Int) {
    -    numRunningTasks -= taskNum
    -    if (parent != null) {
    -      parent.decreaseRunningTasks(taskNum)
    -    }
    -  }
    -
    -  override def addSchedulable(schedulable: Schedulable) {
    -  }
    -
    -  override def removeSchedulable(schedulable: Schedulable) {
    -  }
    -
    -  override def getSchedulableByName(name: String): Schedulable = {
    -    null
    -  }
    -
    -  override def executorLost(executorId: String, host: String): Unit = {
    -  }
    -
    -  override def resourceOffer(
    -      execId: String,
    -      host: String,
    -      maxLocality: TaskLocality.TaskLocality)
    -    : Option[TaskDescription] =
    -  {
    -    if (tasksSuccessful + numRunningTasks < numTasks) {
    -      increaseRunningTasks(1)
    -      Some(new TaskDescription(0, execId, "task 0:0", 0, null))
    -    } else {
    -      None
    -    }
    -  }
    -
    -  override def checkSpeculatableTasks(): Boolean = {
    -    true
    -  }
    -
    -  def taskFinished() {
    -    decreaseRunningTasks(1)
    -    tasksSuccessful +=1
    -    if (tasksSuccessful == numTasks) {
    -      parent.removeSchedulable(this)
    -    }
    -  }
    -
    -  def abort() {
    -    decreaseRunningTasks(numRunningTasks)
    -    parent.removeSchedulable(this)
    -  }
    -}
    -
     class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Logging {
     
    -  def createDummyTaskSetManager(priority: Int, stage: Int, numTasks: Int, cs: TaskSchedulerImpl,
    -      taskSet: TaskSet): FakeTaskSetManager = {
    -    new FakeTaskSetManager(priority, stage, numTasks, cs , taskSet)
    -  }
    -
    -  def resourceOffer(rootPool: Pool): Int = {
    -    val taskSetQueue = rootPool.getSortedTaskSetQueue
    -    /* Just for Test*/
    -    for (manager <- taskSetQueue) {
    -       logInfo("parentName:%s, parent running tasks:%d, name:%s,runningTasks:%d".format(
    -         manager.parent.name, manager.parent.runningTasks, manager.name, manager.runningTasks))
    -    }
    -    for (taskSet <- taskSetQueue) {
    -      taskSet.resourceOffer("execId_1", "hostname_1", TaskLocality.ANY) match {
    -        case Some(task) =>
    -          return taskSet.stageId
    -        case None => {}
    -      }
    -    }
    -    -1
    -  }
    -
    -  def checkTaskSetId(rootPool: Pool, expectedTaskSetId: Int) {
    -    assert(resourceOffer(rootPool) === expectedTaskSetId)
    -  }
    -
    -  test("FIFO Scheduler Test") {
    -    sc = new SparkContext("local", "TaskSchedulerImplSuite")
    -    val taskScheduler = new TaskSchedulerImpl(sc)
    -    val taskSet = FakeTask.createTaskSet(1)
    -
    -    val rootPool = new Pool("", SchedulingMode.FIFO, 0, 0)
    -    val schedulableBuilder = new FIFOSchedulableBuilder(rootPool)
    -    schedulableBuilder.buildPools()
    -
    -    val taskSetManager0 = createDummyTaskSetManager(0, 0, 2, taskScheduler, taskSet)
    -    val taskSetManager1 = createDummyTaskSetManager(0, 1, 2, taskScheduler, taskSet)
    -    val taskSetManager2 = createDummyTaskSetManager(0, 2, 2, taskScheduler, taskSet)
    -    schedulableBuilder.addTaskSetManager(taskSetManager0, null)
    -    schedulableBuilder.addTaskSetManager(taskSetManager1, null)
    -    schedulableBuilder.addTaskSetManager(taskSetManager2, null)
    -
    -    checkTaskSetId(rootPool, 0)
    -    resourceOffer(rootPool)
    -    checkTaskSetId(rootPool, 1)
    -    resourceOffer(rootPool)
    -    taskSetManager1.abort()
    -    checkTaskSetId(rootPool, 2)
    -  }
    -
    -  test("Fair Scheduler Test") {
    -    val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile()
    -    val conf = new SparkConf().set("spark.scheduler.allocation.file", xmlPath)
    -    sc = new SparkContext("local", "TaskSchedulerImplSuite", conf)
    -    val taskScheduler = new TaskSchedulerImpl(sc)
    -    val taskSet = FakeTask.createTaskSet(1)
    -
    -    val rootPool = new Pool("", SchedulingMode.FAIR, 0, 0)
    -    val schedulableBuilder = new FairSchedulableBuilder(rootPool, sc.conf)
    -    schedulableBuilder.buildPools()
    -
    -    assert(rootPool.getSchedulableByName("default") != null)
    -    assert(rootPool.getSchedulableByName("1") != null)
    -    assert(rootPool.getSchedulableByName("2") != null)
    -    assert(rootPool.getSchedulableByName("3") != null)
    -    assert(rootPool.getSchedulableByName("1").minShare === 2)
    -    assert(rootPool.getSchedulableByName("1").weight === 1)
    -    assert(rootPool.getSchedulableByName("2").minShare === 3)
    -    assert(rootPool.getSchedulableByName("2").weight === 1)
    -    assert(rootPool.getSchedulableByName("3").minShare === 0)
    -    assert(rootPool.getSchedulableByName("3").weight === 1)
    -
    -    val properties1 = new Properties()
    -    properties1.setProperty("spark.scheduler.pool","1")
    -    val properties2 = new Properties()
    -    properties2.setProperty("spark.scheduler.pool","2")
    -
    -    val taskSetManager10 = createDummyTaskSetManager(1, 0, 1, taskScheduler, taskSet)
    -    val taskSetManager11 = createDummyTaskSetManager(1, 1, 1, taskScheduler, taskSet)
    -    val taskSetManager12 = createDummyTaskSetManager(1, 2, 2, taskScheduler, taskSet)
    -    schedulableBuilder.addTaskSetManager(taskSetManager10, properties1)
    -    schedulableBuilder.addTaskSetManager(taskSetManager11, properties1)
    -    schedulableBuilder.addTaskSetManager(taskSetManager12, properties1)
    -
    -    val taskSetManager23 = createDummyTaskSetManager(2, 3, 2, taskScheduler, taskSet)
    -    val taskSetManager24 = createDummyTaskSetManager(2, 4, 2, taskScheduler, taskSet)
    -    schedulableBuilder.addTaskSetManager(taskSetManager23, properties2)
    -    schedulableBuilder.addTaskSetManager(taskSetManager24, properties2)
    -
    -    checkTaskSetId(rootPool, 0)
    -    checkTaskSetId(rootPool, 3)
    -    checkTaskSetId(rootPool, 3)
    -    checkTaskSetId(rootPool, 1)
    -    checkTaskSetId(rootPool, 4)
    -    checkTaskSetId(rootPool, 2)
    -    checkTaskSetId(rootPool, 2)
    -    checkTaskSetId(rootPool, 4)
    -
    -    taskSetManager12.taskFinished()
    -    assert(rootPool.getSchedulableByName("1").runningTasks === 3)
    -    taskSetManager24.abort()
    -    assert(rootPool.getSchedulableByName("2").runningTasks === 2)
    -  }
    -
    -  test("Nested Pool Test") {
    -    sc = new SparkContext("local", "TaskSchedulerImplSuite")
    -    val taskScheduler = new TaskSchedulerImpl(sc)
    -    val taskSet = FakeTask.createTaskSet(1)
    -
    -    val rootPool = new Pool("", SchedulingMode.FAIR, 0, 0)
    -    val pool0 = new Pool("0", SchedulingMode.FAIR, 3, 1)
    -    val pool1 = new Pool("1", SchedulingMode.FAIR, 4, 1)
    -    rootPool.addSchedulable(pool0)
    -    rootPool.addSchedulable(pool1)
    -
    -    val pool00 = new Pool("00", SchedulingMode.FAIR, 2, 2)
    -    val pool01 = new Pool("01", SchedulingMode.FAIR, 1, 1)
    -    pool0.addSchedulable(pool00)
    -    pool0.addSchedulable(pool01)
    -
    -    val pool10 = new Pool("10", SchedulingMode.FAIR, 2, 2)
    -    val pool11 = new Pool("11", SchedulingMode.FAIR, 2, 1)
    -    pool1.addSchedulable(pool10)
    -    pool1.addSchedulable(pool11)
    -
    -    val taskSetManager000 = createDummyTaskSetManager(0, 0, 5, taskScheduler, taskSet)
    -    val taskSetManager001 = createDummyTaskSetManager(0, 1, 5, taskScheduler, taskSet)
    -    pool00.addSchedulable(taskSetManager000)
    -    pool00.addSchedulable(taskSetManager001)
    -
    -    val taskSetManager010 = createDummyTaskSetManager(1, 2, 5, taskScheduler, taskSet)
    -    val taskSetManager011 = createDummyTaskSetManager(1, 3, 5, taskScheduler, taskSet)
    -    pool01.addSchedulable(taskSetManager010)
    -    pool01.addSchedulable(taskSetManager011)
    -
    -    val taskSetManager100 = createDummyTaskSetManager(2, 4, 5, taskScheduler, taskSet)
    -    val taskSetManager101 = createDummyTaskSetManager(2, 5, 5, taskScheduler, taskSet)
    -    pool10.addSchedulable(taskSetManager100)
    -    pool10.addSchedulable(taskSetManager101)
    -
    -    val taskSetManager110 = createDummyTaskSetManager(3, 6, 5, taskScheduler, taskSet)
    -    val taskSetManager111 = createDummyTaskSetManager(3, 7, 5, taskScheduler, taskSet)
    -    pool11.addSchedulable(taskSetManager110)
    -    pool11.addSchedulable(taskSetManager111)
    -
    -    checkTaskSetId(rootPool, 0)
    -    checkTaskSetId(rootPool, 4)
    -    checkTaskSetId(rootPool, 6)
    -    checkTaskSetId(rootPool, 2)
    -  }
    -
       test("Scheduler does not always schedule tasks on the same workers") {
         sc = new SparkContext("local", "TaskSchedulerImplSuite")
         val taskScheduler = new TaskSchedulerImpl(sc)
    
    From e9ca16ec943b9553056482d0c085eacb6046821e Mon Sep 17 00:00:00 2001
    From: Liang-Chi Hsieh 
    Date: Fri, 9 Jan 2015 10:27:33 -0800
    Subject: [PATCH 094/116] [SPARK-5145][Mllib] Add BLAS.dsyr and use it in
     GaussianMixtureEM
    
This PR uses BLAS.dsyr to replace a few hand-rolled implementations in GaussianMixtureEM.
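
For intuition, dsyr performs the symmetric rank-1 update A := alpha * x * x^T + A. A minimal plain-Scala reference version is sketched below; naiveSyr is a hypothetical helper for illustration only, while the patch delegates the real work to netlib's dsyr and then mirrors the result across the diagonal.

    // Naive rank-1 update on a column-major n x n matrix, for comparison with BLAS.syr.
    def naiveSyr(alpha: Double, x: Array[Double], a: Array[Double]): Unit = {
      val n = x.length
      require(a.length == n * n, "A must be an n x n matrix stored column-major")
      var i = 0
      while (i < n) {
        var j = 0
        while (j < n) {
          a(j + i * n) += alpha * x(j) * x(i)  // A(j, i) += alpha * x(j) * x(i)
          j += 1
        }
        i += 1
      }
    }

Unlike this naive loop, dsyr updates only one triangle of A, which is why the patch copies the upper triangle into the lower one afterwards.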
    
    Author: Liang-Chi Hsieh 
    
    Closes #3949 from viirya/blas_dsyr and squashes the following commits:
    
    4e4d6cf [Liang-Chi Hsieh] Add unit test. Rename function name, modify doc and style.
    3f57fd2 [Liang-Chi Hsieh] Add BLAS.dsyr and use it in GaussianMixtureEM.
    ---
     .../mllib/clustering/GaussianMixtureEM.scala  | 10 +++--
     .../org/apache/spark/mllib/linalg/BLAS.scala  | 26 ++++++++++++
     .../apache/spark/mllib/linalg/BLASSuite.scala | 41 +++++++++++++++++++
     3 files changed, 73 insertions(+), 4 deletions(-)
    
    diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
    index bdf984aee4dae..3a6c0e681e3fa 100644
    --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
    +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
    @@ -21,7 +21,7 @@ import scala.collection.mutable.IndexedSeq
     
     import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix, diag, Transpose}
     import org.apache.spark.rdd.RDD
    -import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors}
    +import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors, DenseVector, DenseMatrix, BLAS}
     import org.apache.spark.mllib.stat.impl.MultivariateGaussian
     import org.apache.spark.mllib.util.MLUtils
     
    @@ -151,9 +151,10 @@ class GaussianMixtureEM private (
           var i = 0
           while (i < k) {
             val mu = sums.means(i) / sums.weights(i)
    -        val sigma = sums.sigmas(i) / sums.weights(i) - mu * new Transpose(mu) // TODO: Use BLAS.dsyr
    +        BLAS.syr(-sums.weights(i), Vectors.fromBreeze(mu).asInstanceOf[DenseVector],
    +          Matrices.fromBreeze(sums.sigmas(i)).asInstanceOf[DenseMatrix])
             weights(i) = sums.weights(i) / sumWeights
    -        gaussians(i) = new MultivariateGaussian(mu, sigma)
    +        gaussians(i) = new MultivariateGaussian(mu, sums.sigmas(i) / sums.weights(i))
             i = i + 1
           }
        
    @@ -211,7 +212,8 @@ private object ExpectationSum {
           p(i) /= pSum
           sums.weights(i) += p(i)
           sums.means(i) += x * p(i)
    -      sums.sigmas(i) += xxt * p(i) // TODO: use BLAS.dsyr
    +      BLAS.syr(p(i), Vectors.fromBreeze(x).asInstanceOf[DenseVector],
    +        Matrices.fromBreeze(sums.sigmas(i)).asInstanceOf[DenseMatrix])
           i = i + 1
         }
         sums
    diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
    index 9fed513becddc..3414daccd7ca4 100644
    --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
    +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
    @@ -228,6 +228,32 @@ private[spark] object BLAS extends Serializable with Logging {
         }
         _nativeBLAS
       }
    + 
    +  /**
    +   * A := alpha * x * x^T^ + A
    +   * @param alpha a real scalar that will be multiplied to x * x^T^.
    +   * @param x the vector x that contains the n elements.
    +   * @param A the symmetric matrix A. Size of n x n.
    +   */
    +  def syr(alpha: Double, x: DenseVector, A: DenseMatrix) {
    +    val mA = A.numRows
    +    val nA = A.numCols
    +    require(mA == nA, s"A is not a symmetric matrix. A: $mA x $nA")
    +    require(mA == x.size, s"The size of x doesn't match the rank of A. A: $mA x $nA, x: ${x.size}")
    +
    +    nativeBLAS.dsyr("U", x.size, alpha, x.values, 1, A.values, nA)
    +
    +    // Fill lower triangular part of A
    +    var i = 0
    +    while (i < mA) {
    +      var j = i + 1
    +      while (j < nA) {
    +        A(j, i) = A(i, j)
    +        j += 1
    +      }
    +      i += 1
    +    }    
    +  }
     
       /**
        * C := alpha * A * B + beta * C
    diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
    index 5d70c914f14b0..771878e925ea7 100644
    --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
    +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala
    @@ -127,6 +127,47 @@ class BLASSuite extends FunSuite {
         }
       }
     
    +  test("syr") {
    +    val dA = new DenseMatrix(4, 4,
    +      Array(0.0, 1.2, 2.2, 3.1, 1.2, 3.2, 5.3, 4.6, 2.2, 5.3, 1.8, 3.0, 3.1, 4.6, 3.0, 0.8))
    +    val x = new DenseVector(Array(0.0, 2.7, 3.5, 2.1))
    +    val alpha = 0.15
    +
    +    val expected = new DenseMatrix(4, 4,
    +      Array(0.0, 1.2, 2.2, 3.1, 1.2, 4.2935, 6.7175, 5.4505, 2.2, 6.7175, 3.6375, 4.1025, 3.1,
    +        5.4505, 4.1025, 1.4615))
    +
    +    syr(alpha, x, dA)
    +
    +    assert(dA ~== expected absTol 1e-15)
    + 
    +    val dB =
    +      new DenseMatrix(3, 4, Array(0.0, 1.2, 2.2, 3.1, 1.2, 3.2, 5.3, 4.6, 2.2, 5.3, 1.8, 3.0))
    +
    +    withClue("Matrix A must be a symmetric Matrix") {
    +      intercept[Exception] {
    +        syr(alpha, x, dB)
    +      }
    +    }
    + 
    +    val dC =
    +      new DenseMatrix(3, 3, Array(0.0, 1.2, 2.2, 1.2, 3.2, 5.3, 2.2, 5.3, 1.8))
    +
    +    withClue("Size of vector must match the rank of matrix") {
    +      intercept[Exception] {
    +        syr(alpha, x, dC)
    +      }
    +    }
    + 
    +    val y = new DenseVector(Array(0.0, 2.7, 3.5, 2.1, 1.5))
    +
    +    withClue("Size of vector must match the rank of matrix") {
    +      intercept[Exception] {
    +        syr(alpha, y, dA)
    +      }
    +    }
    +  }
    +
       test("gemm") {
     
         val dA =
    
    From 454fe129ee97b859bf079db8b9158e115a219ad5 Mon Sep 17 00:00:00 2001
    From: Jongyoul Lee 
    Date: Fri, 9 Jan 2015 10:47:08 -0800
    Subject: [PATCH 095/116] [SPARK-3619] Upgrade to Mesos 0.21 to work around
     MESOS-1688
    
    - update version from 0.18.1 to 0.21.0
- I'm running some tests to verify that Spark jobs work fine in a Mesos 0.21.0 environment.
    
    Author: Jongyoul Lee 
    
    Closes #3934 from jongyoul/SPARK-3619 and squashes the following commits:
    
    ab994fa [Jongyoul Lee] [SPARK-3619] Upgrade to Mesos 0.21 to work around MESOS-1688 - update version from 0.18.1 to 0.21.0
    ---
     pom.xml | 2 +-
     1 file changed, 1 insertion(+), 1 deletion(-)
    
    diff --git a/pom.xml b/pom.xml
    index 703e5c47bf59b..aadcdfd1083c5 100644
    --- a/pom.xml
    +++ b/pom.xml
    @@ -115,7 +115,7 @@
         1.6
         spark
         2.0.1
    -    0.18.1
    +    0.21.0
         shaded-protobuf
         1.7.5
         1.2.17
    
    From 7e8e62aec11c43c983055adc475b96006412199a Mon Sep 17 00:00:00 2001
    From: "Joseph K. Bradley" 
    Date: Fri, 9 Jan 2015 13:00:15 -0800
    Subject: [PATCH 096/116] [SPARK-5015] [mllib] Random seed for GMM + make test
     suite deterministic
    
    Issues:
    * From JIRA: GaussianMixtureEM uses randomness but does not take a random seed. It should take one as a parameter.
    * This also makes the test suite flaky since initialization can fail due to stochasticity.
    
    Fix:
    * Add random seed
    * Use it in test suite
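
A usage sketch with the new setter (the method names are those added in this patch; `data: RDD[Vector]` and the surrounding SparkContext are assumed to exist):

    // Fixing the seed makes the random initialization, and therefore the fit, reproducible.
    val gmm = new GaussianMixtureEM()
      .setK(2)
      .setSeed(42L)           // defaults to Utils.random.nextLong() when not set
    val model = gmm.run(data) // same seed + same data => same model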
    
    CC: mengxr  tgaloppo
    
    Author: Joseph K. Bradley 
    
    Closes #3981 from jkbradley/gmm-seed and squashes the following commits:
    
    f0df4fd [Joseph K. Bradley] Added seed parameter to GMM.  Updated test suite to use seed to prevent flakiness
    ---
     .../mllib/clustering/GaussianMixtureEM.scala  | 26 ++++++++++++++-----
     .../GMMExpectationMaximizationSuite.scala     | 14 +++++-----
     2 files changed, 27 insertions(+), 13 deletions(-)
    
    diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
    index 3a6c0e681e3fa..b3c5631cc4cc6 100644
    --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
    +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
    @@ -24,6 +24,7 @@ import org.apache.spark.rdd.RDD
     import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors, DenseVector, DenseMatrix, BLAS}
     import org.apache.spark.mllib.stat.impl.MultivariateGaussian
     import org.apache.spark.mllib.util.MLUtils
    +import org.apache.spark.util.Utils
     
     /**
      * This class performs expectation maximization for multivariate Gaussian
    @@ -45,10 +46,11 @@ import org.apache.spark.mllib.util.MLUtils
     class GaussianMixtureEM private (
         private var k: Int, 
         private var convergenceTol: Double, 
    -    private var maxIterations: Int) extends Serializable {
    +    private var maxIterations: Int,
    +    private var seed: Long) extends Serializable {
       
       /** A default instance, 2 Gaussians, 100 iterations, 0.01 log-likelihood threshold */
    -  def this() = this(2, 0.01, 100)
    +  def this() = this(2, 0.01, 100, Utils.random.nextLong())
       
       // number of samples per cluster to use when initializing Gaussians
       private val nSamples = 5
    @@ -100,11 +102,21 @@ class GaussianMixtureEM private (
         this
       }
       
    -  /** Return the largest change in log-likelihood at which convergence is
    -   *  considered to have occurred.
    +  /**
    +   * Return the largest change in log-likelihood at which convergence is
    +   * considered to have occurred.
        */
       def getConvergenceTol: Double = convergenceTol
    -  
    +
    +  /** Set the random seed */
    +  def setSeed(seed: Long): this.type = {
    +    this.seed = seed
    +    this
    +  }
    +
    +  /** Return the random seed */
    +  def getSeed: Long = seed
    +
       /** Perform expectation maximization */
       def run(data: RDD[Vector]): GaussianMixtureModel = {
         val sc = data.sparkContext
    @@ -113,7 +125,7 @@ class GaussianMixtureEM private (
         val breezeData = data.map(u => u.toBreeze.toDenseVector).cache()
         
         // Get length of the input vectors
    -    val d = breezeData.first.length 
    +    val d = breezeData.first().length
         
         // Determine initial weights and corresponding Gaussians.
         // If the user supplied an initial GMM, we use those values, otherwise
    @@ -126,7 +138,7 @@ class GaussianMixtureEM private (
           })
           
           case None => {
    -        val samples = breezeData.takeSample(true, k * nSamples, scala.util.Random.nextInt)
    +        val samples = breezeData.takeSample(withReplacement = true, k * nSamples, seed)
             (Array.fill(k)(1.0 / k), Array.tabulate(k) { i => 
               val slice = samples.view(i * nSamples, (i + 1) * nSamples)
               new MultivariateGaussian(vectorMean(slice), initCovariance(slice)) 
    diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala
    index 23feb82874b70..9da5495741a80 100644
    --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala
    +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala
    @@ -35,12 +35,14 @@ class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContex
         val Ew = 1.0
         val Emu = Vectors.dense(5.0, 10.0)
         val Esigma = Matrices.dense(2, 2, Array(2.0 / 3.0, -2.0 / 3.0, -2.0 / 3.0, 2.0 / 3.0))
    -    
    -    val gmm = new GaussianMixtureEM().setK(1).run(data)
    -                
    -    assert(gmm.weight(0) ~== Ew absTol 1E-5)
    -    assert(gmm.mu(0) ~== Emu absTol 1E-5)
    -    assert(gmm.sigma(0) ~== Esigma absTol 1E-5)
    +
    +    val seeds = Array(314589, 29032897, 50181, 494821, 4660)
    +    seeds.foreach { seed =>
    +      val gmm = new GaussianMixtureEM().setK(1).setSeed(seed).run(data)
    +      assert(gmm.weight(0) ~== Ew absTol 1E-5)
    +      assert(gmm.mu(0) ~== Emu absTol 1E-5)
    +      assert(gmm.sigma(0) ~== Esigma absTol 1E-5)
    +    }
       }
       
       test("two clusters") {
    
    From e96645206006a009e5c1a23bbd177dcaf3ef9b83 Mon Sep 17 00:00:00 2001
    From: WangTaoTheTonic 
    Date: Fri, 9 Jan 2015 13:20:32 -0800
    Subject: [PATCH 097/116] [SPARK-1953][YARN]yarn client mode Application Master
     memory size is same as driver memory...
    
    ... size
    
    Ways to set Application Master's memory on yarn-client mode:
    1.  `spark.yarn.am.memory` in SparkConf or System Properties
    2.  default value 512m
    
Note: this argument is only available in yarn-client mode.
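
A hedged example of how an application would request a larger AM with the new property (the property name comes from this patch; the exact invocation is illustrative):

    // yarn-client mode: ask for a 1g Application Master.
    val conf = new SparkConf().set("spark.yarn.am.memory", "1g")
    // Equivalent command-line form:
    //   spark-submit --master yarn-client --conf spark.yarn.am.memory=1g ...

In cluster mode the property is ignored (a warning is printed) and the AM size follows the driver memory settings instead.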
    
    Author: WangTaoTheTonic 
    
    Closes #3607 from WangTaoTheTonic/SPARK4181 and squashes the following commits:
    
    d5ceb1b [WangTaoTheTonic] spark.driver.memeory is used in both modes
    6c1b264 [WangTaoTheTonic] rebase
    b8410c0 [WangTaoTheTonic] minor optiminzation
    ddcd592 [WangTaoTheTonic] fix the bug produced in rebase and some improvements
    3bf70cc [WangTaoTheTonic] rebase and give proper hint
    987b99d [WangTaoTheTonic] disable --driver-memory in client mode
    2b27928 [WangTaoTheTonic] inaccurate description
    b7acbb2 [WangTaoTheTonic] incorrect method invoked
    2557c5e [WangTaoTheTonic] missing a single blank
    42075b0 [WangTaoTheTonic] arrange the args and warn logging
    69c7dba [WangTaoTheTonic] rebase
    1960d16 [WangTaoTheTonic] fix wrong comment
    7fa9e2e [WangTaoTheTonic] log a warning
    f6bee0e [WangTaoTheTonic] docs issue
    d619996 [WangTaoTheTonic] Merge branch 'master' into SPARK4181
    b09c309 [WangTaoTheTonic] use code format
    ab16bb5 [WangTaoTheTonic] fix bug and add comments
    44e48c2 [WangTaoTheTonic] minor fix
    6fd13e1 [WangTaoTheTonic] add overhead mem and remove some configs
    0566bb8 [WangTaoTheTonic] yarn client mode Application Master memory size is same as driver memory size
    ---
     .../spark/deploy/SparkSubmitArguments.scala   |  3 +-
     docs/running-on-yarn.md                       | 19 +++++++++-
     .../org/apache/spark/deploy/yarn/Client.scala |  2 +-
     .../spark/deploy/yarn/ClientArguments.scala   | 37 ++++++++++++++-----
     .../cluster/YarnClientSchedulerBackend.scala  |  2 -
     5 files changed, 48 insertions(+), 15 deletions(-)
    
    diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
    index 1faabe91f49a8..f14ef4d299383 100644
    --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
    +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
    @@ -405,7 +405,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St
             |  --queue QUEUE_NAME          The YARN queue to submit to (Default: "default").
             |  --num-executors NUM         Number of executors to launch (Default: 2).
             |  --archives ARCHIVES         Comma separated list of archives to be extracted into the
    -        |                              working directory of each executor.""".stripMargin
    +        |                              working directory of each executor.
    +      """.stripMargin
         )
         SparkSubmit.exitFn()
       }
    diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
    index 183698ffe9304..4f273098c5db3 100644
    --- a/docs/running-on-yarn.md
    +++ b/docs/running-on-yarn.md
    @@ -21,6 +21,14 @@ Most of the configs are the same for Spark on YARN as for other deployment modes
     
     
 [added documentation for the new spark.yarn.am.memory / spark.yarn.am.memoryOverhead properties; the original HTML table markup was not recoverable]

@@ -90,7 +98,14 @@ Most of the configs are the same for Spark on YARN as for other deployment modes
 [added table rows; markup not recoverable]

@@ -145,7 +160,7 @@ Most of the configs are the same for Spark on YARN as for other deployment modes
 [one-line table change; markup not recoverable]

diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index c363d755c1752..032106371cd60 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -65,7 +65,7 @@ private[spark] class Client(
   private val amMemoryOverhead = args.amMemoryOverhead // MB
   private val executorMemoryOverhead = args.executorMemoryOverhead // MB
   private val distCacheMgr = new ClientDistributedCacheManager()
-  private val isClusterMode = args.userClass != null
+  private val isClusterMode = args.isClusterMode
 
   def stop(): Unit = yarnClient.stop()
 
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
index 39f1021c9d942..fdbf9f8eed029 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
@@ -38,23 +38,27 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf)
   var amMemory: Int = 512 // MB
   var appName: String = "Spark"
   var priority = 0
+  def isClusterMode: Boolean = userClass != null
+
+  private var driverMemory: Int = 512 // MB
+  private val driverMemOverheadKey = "spark.yarn.driver.memoryOverhead"
+  private val amMemKey = "spark.yarn.am.memory"
+  private val amMemOverheadKey = "spark.yarn.am.memoryOverhead"
+  private val isDynamicAllocationEnabled =
+    sparkConf.getBoolean("spark.dynamicAllocation.enabled", false)
 
   parseArgs(args.toList)
+  loadEnvironmentArgs()
+  validateArgs()
 
   // Additional memory to allocate to containers
-  // For now, use driver's memory overhead as our AM container's memory overhead
-  val amMemoryOverhead = sparkConf.getInt("spark.yarn.driver.memoryOverhead",
+  val amMemoryOverheadConf = if (isClusterMode) driverMemOverheadKey else amMemOverheadKey
+  val amMemoryOverhead = sparkConf.getInt(amMemoryOverheadConf,
     math.max((MEMORY_OVERHEAD_FACTOR * amMemory).toInt, MEMORY_OVERHEAD_MIN))
 
   val executorMemoryOverhead = sparkConf.getInt("spark.yarn.executor.memoryOverhead",
     math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toInt, MEMORY_OVERHEAD_MIN))
 
-  private val isDynamicAllocationEnabled =
-    sparkConf.getBoolean("spark.dynamicAllocation.enabled", false)
-
-  loadEnvironmentArgs()
-  validateArgs()
-
   /** Load any default arguments provided through environment variables and Spark properties.
    */
   private def loadEnvironmentArgs(): Unit = {
     // For backward compatibility, SPARK_YARN_DIST_{ARCHIVES/FILES} should be resolved to hdfs://,
@@ -87,6 +91,21 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf)
       throw new IllegalArgumentException(
         "You must specify at least 1 executor!\n" + getUsageMessage())
     }
+    if (isClusterMode) {
+      for (key <- Seq(amMemKey, amMemOverheadKey)) {
+        if (sparkConf.contains(key)) {
+          println(s"$key is set but does not apply in cluster mode.")
+        }
+      }
+      amMemory = driverMemory
+    } else {
+      if (sparkConf.contains(driverMemOverheadKey)) {
+        println(s"$driverMemOverheadKey is set but does not apply in client mode.")
+      }
+      sparkConf.getOption(amMemKey)
+        .map(Utils.memoryStringToMb)
+        .foreach { mem => amMemory = mem }
+    }
   }
 
   private def parseArgs(inputArgs: List[String]): Unit = {
@@ -118,7 +137,7 @@ private[spark] class ClientArguments(args: Array[String], sparkConf: SparkConf)
         if (args(0) == "--master-memory") {
           println("--master-memory is deprecated. Use --driver-memory instead.")
         }
-        amMemory = value
+        driverMemory = value
         args = tail
 
       case ("--num-workers" | "--num-executors") :: IntParam(value) :: tail =>
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
index 09597bd0e6ab9..f99291553b7b8 100644
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
+++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
@@ -68,8 +68,6 @@ private[spark] class YarnClientSchedulerBackend(
 
     // List of (target Client argument, environment variable, Spark property)
     val optionTuples = List(
-      ("--driver-memory", "SPARK_MASTER_MEMORY", "spark.master.memory"),
-      ("--driver-memory", "SPARK_DRIVER_MEMORY", "spark.driver.memory"),
       ("--num-executors", "SPARK_WORKER_INSTANCES", "spark.executor.instances"),
       ("--num-executors", "SPARK_EXECUTOR_INSTANCES", "spark.executor.instances"),
       ("--executor-memory", "SPARK_WORKER_MEMORY", "spark.executor.memory"),

From e0f28e010cdd67a2a4c8aebd35323d69a3182ba8 Mon Sep 17 00:00:00 2001
From: mcheah
Date: Fri, 9 Jan 2015 14:16:20 -0800
Subject: [PATCH 098/116] [SPARK-4737] Task set manager properly handles
 serialization errors

Dealing with [SPARK-4737], the handling of serialization errors should not be
the DAGScheduler's responsibility. The task set manager now catches the error
and aborts the stage. If the TaskSetManager throws a TaskNotSerializableException,
the TaskSchedulerImpl will return an empty list of task descriptions, because no
tasks were started. The scheduler should abort the stage gracefully.

Note that I'm not too familiar with this part of the codebase and its place in
the overall architecture of the Spark stack. If implementing it this way will
have any adverse side effects please voice that loudly.

Author: mcheah

Closes #3638 from mccheah/task-set-manager-properly-handle-ser-err and squashes the following commits:

1545984 [mcheah] Some more style fixes from Andrew Or.
5267929 [mcheah] Fixing style suggestions from Andrew Or.
dfa145b [mcheah] Fixing style from Josh Rosen's feedback
b2a430d [mcheah] Not returning empty seq when a task set cannot be serialized.
94844d7 [mcheah] Fixing compilation error, one brace too many
5f486f4 [mcheah] Adding license header for fake task class
bf5e706 [mcheah] Fixing indentation.
097e7a2 [mcheah] [SPARK-4737] Catching task serialization exception in TaskSetManager --- .../spark/TaskNotSerializableException.scala | 25 +++++++++ .../apache/spark/scheduler/DAGScheduler.scala | 20 ------- .../spark/scheduler/TaskSchedulerImpl.scala | 54 +++++++++++++------ .../spark/scheduler/TaskSetManager.scala | 18 +++++-- .../org/apache/spark/SharedSparkContext.scala | 2 +- .../scala/org/apache/spark/rdd/RDDSuite.scala | 21 ++++++++ .../scheduler/NotSerializableFakeTask.scala | 40 ++++++++++++++ .../scheduler/TaskSchedulerImplSuite.scala | 30 +++++++++++ .../spark/scheduler/TaskSetManagerSuite.scala | 14 +++++ 9 files changed, 182 insertions(+), 42 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/TaskNotSerializableException.scala create mode 100644 core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala diff --git a/core/src/main/scala/org/apache/spark/TaskNotSerializableException.scala b/core/src/main/scala/org/apache/spark/TaskNotSerializableException.scala new file mode 100644 index 0000000000000..9df61062e1f85 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/TaskNotSerializableException.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark + +import org.apache.spark.annotation.DeveloperApi + +/** + * Exception thrown when a task cannot be serialized. + */ +private[spark] class TaskNotSerializableException(error: Throwable) extends Exception(error) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 259621d263d7c..61d09d73e17cb 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -866,26 +866,6 @@ class DAGScheduler( } if (tasks.size > 0) { - // Preemptively serialize a task to make sure it can be serialized. We are catching this - // exception here because it would be fairly hard to catch the non-serializable exception - // down the road, where we have several different implementations for local scheduler and - // cluster schedulers. - // - // We've already serialized RDDs and closures in taskBinary, but here we check for all other - // objects such as Partition. - try { - closureSerializer.serialize(tasks.head) - } catch { - case e: NotSerializableException => - abortStage(stage, "Task not serializable: " + e.toString) - runningStages -= stage - return - case NonFatal(e) => // Other exceptions, such as IllegalArgumentException from Kryo. 
- abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}") - runningStages -= stage - return - } - logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")") stage.pendingTasks ++= tasks logDebug("New pending tasks: " + stage.pendingTasks) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index a41f3eef195d2..a1dfb01062591 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -31,6 +31,7 @@ import scala.util.Random import org.apache.spark._ import org.apache.spark.TaskState.TaskState import org.apache.spark.scheduler.SchedulingMode.SchedulingMode +import org.apache.spark.scheduler.TaskLocality.TaskLocality import org.apache.spark.util.Utils import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockManagerId @@ -209,6 +210,40 @@ private[spark] class TaskSchedulerImpl( .format(manager.taskSet.id, manager.parent.name)) } + private def resourceOfferSingleTaskSet( + taskSet: TaskSetManager, + maxLocality: TaskLocality, + shuffledOffers: Seq[WorkerOffer], + availableCpus: Array[Int], + tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = { + var launchedTask = false + for (i <- 0 until shuffledOffers.size) { + val execId = shuffledOffers(i).executorId + val host = shuffledOffers(i).host + if (availableCpus(i) >= CPUS_PER_TASK) { + try { + for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { + tasks(i) += task + val tid = task.taskId + taskIdToTaskSetId(tid) = taskSet.taskSet.id + taskIdToExecutorId(tid) = execId + executorsByHost(host) += execId + availableCpus(i) -= CPUS_PER_TASK + assert(availableCpus(i) >= 0) + launchedTask = true + } + } catch { + case e: TaskNotSerializableException => + logError(s"Resource offer failed, task set ${taskSet.name} was not serializable") + // Do not offer resources for this task, but don't throw an error to allow other + // task sets to be submitted. + return launchedTask + } + } + } + return launchedTask + } + /** * Called by cluster manager to offer resources on slaves. We respond by asking our active task * sets for tasks in order of priority. 
We fill each node with tasks in a round-robin manner so @@ -251,23 +286,8 @@ private[spark] class TaskSchedulerImpl( var launchedTask = false for (taskSet <- sortedTaskSets; maxLocality <- taskSet.myLocalityLevels) { do { - launchedTask = false - for (i <- 0 until shuffledOffers.size) { - val execId = shuffledOffers(i).executorId - val host = shuffledOffers(i).host - if (availableCpus(i) >= CPUS_PER_TASK) { - for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { - tasks(i) += task - val tid = task.taskId - taskIdToTaskSetId(tid) = taskSet.taskSet.id - taskIdToExecutorId(tid) = execId - executorsByHost(host) += execId - availableCpus(i) -= CPUS_PER_TASK - assert(availableCpus(i) >= 0) - launchedTask = true - } - } - } + launchedTask = resourceOfferSingleTaskSet( + taskSet, maxLocality, shuffledOffers, availableCpus, tasks) } while (launchedTask) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 28e6147509f78..4667850917151 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -18,12 +18,14 @@ package org.apache.spark.scheduler import java.io.NotSerializableException +import java.nio.ByteBuffer import java.util.Arrays import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import scala.math.{min, max} +import scala.util.control.NonFatal import org.apache.spark._ import org.apache.spark.executor.TaskMetrics @@ -417,6 +419,7 @@ private[spark] class TaskSetManager( * @param host the host Id of the offered resource * @param maxLocality the maximum locality we want to schedule the tasks at */ + @throws[TaskNotSerializableException] def resourceOffer( execId: String, host: String, @@ -456,10 +459,17 @@ private[spark] class TaskSetManager( } // Serialize and return the task val startTime = clock.getTime() - // We rely on the DAGScheduler to catch non-serializable closures and RDDs, so in here - // we assume the task can be serialized without exceptions. - val serializedTask = Task.serializeWithDependencies( - task, sched.sc.addedFiles, sched.sc.addedJars, ser) + val serializedTask: ByteBuffer = try { + Task.serializeWithDependencies(task, sched.sc.addedFiles, sched.sc.addedJars, ser) + } catch { + // If the task cannot be serialized, then there's no point to re-attempt the task, + // as it will always fail. So just abort the whole task-set. + case NonFatal(e) => + val msg = s"Failed to serialize task $taskId, not attempting to retry it." 
+ logError(msg, e) + abort(s"$msg Exception during serialization: $e") + throw new TaskNotSerializableException(e) + } if (serializedTask.limit > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 && !emittedTaskSizeWarning) { emittedTaskSizeWarning = true diff --git a/core/src/test/scala/org/apache/spark/SharedSparkContext.scala b/core/src/test/scala/org/apache/spark/SharedSparkContext.scala index 0b6511a80df1d..3d2700b7e6be4 100644 --- a/core/src/test/scala/org/apache/spark/SharedSparkContext.scala +++ b/core/src/test/scala/org/apache/spark/SharedSparkContext.scala @@ -30,7 +30,7 @@ trait SharedSparkContext extends BeforeAndAfterAll { self: Suite => var conf = new SparkConf(false) override def beforeAll() { - _sc = new SparkContext("local", "test", conf) + _sc = new SparkContext("local[4]", "test", conf) super.beforeAll() } diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index 6836e9ab0fd6b..0deb9b18b8688 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -17,6 +17,10 @@ package org.apache.spark.rdd +import java.io.{ObjectInputStream, ObjectOutputStream, IOException} + +import com.esotericsoftware.kryo.KryoException + import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.collection.JavaConverters._ import scala.reflect.ClassTag @@ -887,6 +891,23 @@ class RDDSuite extends FunSuite with SharedSparkContext { assert(ancestors6.count(_.isInstanceOf[CyclicalDependencyRDD[_]]) === 3) } + test("task serialization exception should not hang scheduler") { + class BadSerializable extends Serializable { + @throws(classOf[IOException]) + private def writeObject(out: ObjectOutputStream): Unit = throw new KryoException("Bad serialization") + + @throws(classOf[IOException]) + private def readObject(in: ObjectInputStream): Unit = {} + } + // Note that in the original bug, SPARK-4349, that this verifies, the job would only hang if there were + // more threads in the Spark Context than there were number of objects in this sequence. + intercept[Throwable] { + sc.parallelize(Seq(new BadSerializable, new BadSerializable)).collect + } + // Check that the context has not crashed + sc.parallelize(1 to 100).map(x => x*2).collect + } + /** A contrived RDD that allows the manual addition of dependencies after creation. */ private class CyclicalDependencyRDD[T: ClassTag] extends RDD[T](sc, Nil) { private val mutableDependencies: ArrayBuffer[Dependency[_]] = ArrayBuffer.empty diff --git a/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala b/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala new file mode 100644 index 0000000000000..6b75c98839e03 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/NotSerializableFakeTask.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.scheduler + +import java.io.{ObjectInputStream, ObjectOutputStream, IOException} + +import org.apache.spark.TaskContext + +/** + * A Task implementation that fails to serialize. + */ +private[spark] class NotSerializableFakeTask(myId: Int, stageId: Int) extends Task[Array[Byte]](stageId, 0) { + override def runTask(context: TaskContext): Array[Byte] = Array.empty[Byte] + override def preferredLocations: Seq[TaskLocation] = Seq[TaskLocation]() + + @throws(classOf[IOException]) + private def writeObject(out: ObjectOutputStream): Unit = { + if (stageId == 0) { + throw new IllegalStateException("Cannot serialize") + } + } + + @throws(classOf[IOException]) + private def readObject(in: ObjectInputStream): Unit = {} +} diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 8874cf00e9993..add13f5b21765 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -100,4 +100,34 @@ class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Loggin assert(1 === taskDescriptions.length) assert("executor0" === taskDescriptions(0).executorId) } + + test("Scheduler does not crash when tasks are not serializable") { + sc = new SparkContext("local", "TaskSchedulerImplSuite") + val taskCpus = 2 + + sc.conf.set("spark.task.cpus", taskCpus.toString) + val taskScheduler = new TaskSchedulerImpl(sc) + taskScheduler.initialize(new FakeSchedulerBackend) + // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks. 
+ val dagScheduler = new DAGScheduler(sc, taskScheduler) { + override def taskStarted(task: Task[_], taskInfo: TaskInfo) {} + override def executorAdded(execId: String, host: String) {} + } + val numFreeCores = 1 + taskScheduler.setDAGScheduler(dagScheduler) + var taskSet = new TaskSet(Array(new NotSerializableFakeTask(1, 0), new NotSerializableFakeTask(0, 1)), 0, 0, 0, null) + val multiCoreWorkerOffers = Seq(new WorkerOffer("executor0", "host0", taskCpus), + new WorkerOffer("executor1", "host1", numFreeCores)) + taskScheduler.submitTasks(taskSet) + var taskDescriptions = taskScheduler.resourceOffers(multiCoreWorkerOffers).flatten + assert(0 === taskDescriptions.length) + + // Now check that we can still submit tasks + // Even if one of the tasks has not-serializable tasks, the other task set should still be processed without error + taskScheduler.submitTasks(taskSet) + taskScheduler.submitTasks(FakeTask.createTaskSet(1)) + taskDescriptions = taskScheduler.resourceOffers(multiCoreWorkerOffers).flatten + assert(taskDescriptions.map(_.executorId) === Seq("executor0")) + } + } diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index 472191551a01f..84b9b788237bf 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.scheduler +import java.io.{ObjectInputStream, ObjectOutputStream, IOException} import java.util.Random import scala.collection.mutable.ArrayBuffer @@ -563,6 +564,19 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { assert(manager.emittedTaskSizeWarning) } + test("Not serializable exception thrown if the task cannot be serialized") { + sc = new SparkContext("local", "test") + val sched = new FakeTaskScheduler(sc, ("exec1", "host1")) + + val taskSet = new TaskSet(Array(new NotSerializableFakeTask(1, 0), new NotSerializableFakeTask(0, 1)), 0, 0, 0, null) + val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES) + + intercept[TaskNotSerializableException] { + manager.resourceOffer("exec1", "host1", ANY) + } + assert(manager.isZombie) + } + test("abort the job if total size of results is too large") { val conf = new SparkConf().set("spark.driver.maxResultSize", "2m") sc = new SparkContext("local", "test", conf) From ae628725abce9ffe34b9a7110d5ac51a076454aa Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Fri, 9 Jan 2015 14:40:45 -0800 Subject: [PATCH 099/116] [DOC] Fixed Mesos version in doc from 0.18.1 to 0.21.0 #3934 upgraded Mesos version so we should also fix docs right? This issue is really minor so I don't file in JIRA. 
Author: Kousuke Saruta Closes #3982 from sarutak/fix-mesos-version and squashes the following commits: 9a86ee3 [Kousuke Saruta] Fixed mesos version from 0.18.1 to 0.21.0 --- docs/_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_config.yml b/docs/_config.yml index a96a76dd9ab5e..e2db274e1f619 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -17,6 +17,6 @@ SPARK_VERSION: 1.3.0-SNAPSHOT SPARK_VERSION_SHORT: 1.3.0 SCALA_BINARY_VERSION: "2.10" SCALA_VERSION: "2.10.4" -MESOS_VERSION: 0.18.1 +MESOS_VERSION: 0.21.0 SPARK_ISSUE_TRACKER_URL: https://issues.apache.org/jira/browse/SPARK SPARK_GITHUB_URL: https://github.com/apache/spark From 4e1f12d997426560226648d62ee17c90352613e7 Mon Sep 17 00:00:00 2001 From: bilna Date: Fri, 9 Jan 2015 14:45:28 -0800 Subject: [PATCH 100/116] [Minor] Fix import order and other coding style fixed import order and other coding style Author: bilna Author: Bilna P Closes #3966 from Bilna/master and squashes the following commits: 5e76f04 [bilna] fix import order and other coding style 5718d66 [bilna] Merge remote-tracking branch 'upstream/master' ae56514 [bilna] Merge remote-tracking branch 'upstream/master' acea3a3 [bilna] Adding dependency with scope test 28681fa [bilna] Merge remote-tracking branch 'upstream/master' fac3904 [bilna] Correction in Indentation and coding style ed9db4c [bilna] Merge remote-tracking branch 'upstream/master' 4b34ee7 [Bilna P] Update MQTTStreamSuite.scala 04503cf [bilna] Added embedded broker service for mqtt test 89d804e [bilna] Merge remote-tracking branch 'upstream/master' fc8eb28 [bilna] Merge remote-tracking branch 'upstream/master' 4b58094 [Bilna P] Update MQTTStreamSuite.scala b1ac4ad [bilna] Added BeforeAndAfter 5f6bfd2 [bilna] Added BeforeAndAfter e8b6623 [Bilna P] Update MQTTStreamSuite.scala 5ca6691 [Bilna P] Update MQTTStreamSuite.scala 8616495 [bilna] [SPARK-4631] unit test for MQTT --- .../spark/streaming/mqtt/MQTTStreamSuite.scala | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala index 98fe6cb301f52..39eb8b183488f 100644 --- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala @@ -19,16 +19,19 @@ package org.apache.spark.streaming.mqtt import java.net.{URI, ServerSocket} +import scala.concurrent.duration._ + import org.apache.activemq.broker.{TransportConnector, BrokerService} -import org.apache.spark.util.Utils +import org.eclipse.paho.client.mqttv3._ +import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence + import org.scalatest.{BeforeAndAfter, FunSuite} import org.scalatest.concurrent.Eventually -import scala.concurrent.duration._ + import org.apache.spark.streaming.{Milliseconds, StreamingContext} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.ReceiverInputDStream -import org.eclipse.paho.client.mqttv3._ -import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence +import org.apache.spark.util.Utils class MQTTStreamSuite extends FunSuite with Eventually with BeforeAndAfter { @@ -38,8 +41,9 @@ class MQTTStreamSuite extends FunSuite with Eventually with BeforeAndAfter { private val freePort = findFreePort() private val brokerUri = "//localhost:" + freePort private val topic = "def" - private var 
ssc: StreamingContext = _ private val persistenceDir = Utils.createTempDir() + + private var ssc: StreamingContext = _ private var broker: BrokerService = _ private var connector: TransportConnector = _ @@ -115,8 +119,9 @@ class MQTTStreamSuite extends FunSuite with Eventually with BeforeAndAfter { val message: MqttMessage = new MqttMessage(data.getBytes("utf-8")) message.setQos(1) message.setRetained(true) - for (i <- 0 to 100) + for (i <- 0 to 100) { msgTopic.publish(message) + } } } finally { client.disconnect() From 8782eb992f461502238c41ece3a3002efa67a792 Mon Sep 17 00:00:00 2001 From: WangTaoTheTonic Date: Fri, 9 Jan 2015 17:10:02 -0800 Subject: [PATCH 101/116] [SPARK-4990][Deploy]to find default properties file, search SPARK_CONF_DIR first https://issues.apache.org/jira/browse/SPARK-4990 Author: WangTaoTheTonic Author: WangTao Closes #3823 from WangTaoTheTonic/SPARK-4990 and squashes the following commits: 133c43e [WangTao] Update spark-submit2.cmd b1ab402 [WangTao] Update spark-submit 4cc7f34 [WangTaoTheTonic] rebase 55300bc [WangTaoTheTonic] use export to make it global d8d3cb7 [WangTaoTheTonic] remove blank line 07b9ebf [WangTaoTheTonic] check SPARK_CONF_DIR instead of checking properties file c5a85eb [WangTaoTheTonic] to find default properties file, search SPARK_CONF_DIR first --- bin/spark-submit | 5 ++++- bin/spark-submit2.cmd | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/bin/spark-submit b/bin/spark-submit index aefd38a0a2b90..3e5cbdbb24394 100755 --- a/bin/spark-submit +++ b/bin/spark-submit @@ -44,7 +44,10 @@ while (($#)); do shift done -DEFAULT_PROPERTIES_FILE="$SPARK_HOME/conf/spark-defaults.conf" +if [ -z "$SPARK_CONF_DIR" ]; then + export SPARK_CONF_DIR="$SPARK_HOME/conf" +fi +DEFAULT_PROPERTIES_FILE="$SPARK_CONF_DIR/spark-defaults.conf" if [ "$MASTER" == "yarn-cluster" ]; then SPARK_SUBMIT_DEPLOY_MODE=cluster fi diff --git a/bin/spark-submit2.cmd b/bin/spark-submit2.cmd index daf0284db9230..12244a9cb04fb 100644 --- a/bin/spark-submit2.cmd +++ b/bin/spark-submit2.cmd @@ -24,7 +24,11 @@ set ORIG_ARGS=%* rem Reset the values of all variables used set SPARK_SUBMIT_DEPLOY_MODE=client -set SPARK_SUBMIT_PROPERTIES_FILE=%SPARK_HOME%\conf\spark-defaults.conf + +if not defined %SPARK_CONF_DIR% ( + set SPARK_CONF_DIR=%SPARK_HOME%\conf +) +set SPARK_SUBMIT_PROPERTIES_FILE=%SPARK_CONF_DIR%\spark-defaults.conf set SPARK_SUBMIT_DRIVER_MEMORY= set SPARK_SUBMIT_LIBRARY_PATH= set SPARK_SUBMIT_CLASSPATH= From 4554529dce8fe8ca937d887109ef072eef52bf51 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Fri, 9 Jan 2015 17:45:18 -0800 Subject: [PATCH 102/116] [SPARK-4406] [MLib] FIX: Validate k in SVD Raise exception when k is non-positive in SVD Author: MechCoder Closes #3945 from MechCoder/spark-4406 and squashes the following commits: 64e6d2d [MechCoder] TST: Add better test errors and messages 12dae73 [MechCoder] [SPARK-4406] FIX: Validate k in SVD --- .../spark/mllib/linalg/distributed/IndexedRowMatrix.scala | 3 +++ .../apache/spark/mllib/linalg/distributed/RowMatrix.scala | 2 +- .../mllib/linalg/distributed/IndexedRowMatrixSuite.scala | 7 +++++++ .../spark/mllib/linalg/distributed/RowMatrixSuite.scala | 8 ++++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index 36d8cadd2bdd7..181f507516485 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -102,6 +102,9 @@ class IndexedRowMatrix( k: Int, computeU: Boolean = false, rCond: Double = 1e-9): SingularValueDecomposition[IndexedRowMatrix, Matrix] = { + + val n = numCols().toInt + require(k > 0 && k <= n, s"Requested k singular values but got k=$k and numCols=$n.") val indices = rows.map(_.index) val svd = toRowMatrix().computeSVD(k, computeU, rCond) val U = if (computeU) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index fbd35e372f9b1..d5abba6a4b645 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -212,7 +212,7 @@ class RowMatrix( tol: Double, mode: String): SingularValueDecomposition[RowMatrix, Matrix] = { val n = numCols().toInt - require(k > 0 && k <= n, s"Request up to n singular values but got k=$k and n=$n.") + require(k > 0 && k <= n, s"Requested k singular values but got k=$k and numCols=$n.") object SVDMode extends Enumeration { val LocalARPACK, LocalLAPACK, DistARPACK = Value diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala index e25bc02b06c9a..741cd4997b853 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala @@ -113,6 +113,13 @@ class IndexedRowMatrixSuite extends FunSuite with MLlibTestSparkContext { assert(closeToZero(U * brzDiag(s) * V.t - localA)) } + test("validate k in svd") { + val A = new IndexedRowMatrix(indexedRows) + intercept[IllegalArgumentException] { + A.computeSVD(-1) + } + } + def closeToZero(G: BDM[Double]): Boolean = { G.valuesIterator.map(math.abs).sum < 1e-6 } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala index dbf55ff81ca99..3309713e91f87 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala @@ -171,6 +171,14 @@ class RowMatrixSuite extends FunSuite with MLlibTestSparkContext { } } + test("validate k in svd") { + for (mat <- Seq(denseMat, sparseMat)) { + intercept[IllegalArgumentException] { + mat.computeSVD(-1) + } + } + } + def closeToZero(G: BDM[Double]): Boolean = { G.valuesIterator.map(math.abs).sum < 1e-6 } From 545dfcb92f2ff82a51877d35d9669094ea81f466 Mon Sep 17 00:00:00 2001 From: luogankun Date: Fri, 9 Jan 2015 20:38:41 -0800 Subject: [PATCH 103/116] [SPARK-5141][SQL]CaseInsensitiveMap throws java.io.NotSerializableException CaseInsensitiveMap throws java.io.NotSerializableException. 
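The one-line fix below mixes `Serializable` into the wrapper class. The underlying issue is that Scala's `Map` trait does not itself extend `java.io.Serializable`, so a custom subclass is not serializable unless it opts in explicitly. A hedged sketch with an invented wrapper that mirrors the shape of `CaseInsensitiveMap` (Scala 2.10-era collections assumed):

```scala
import java.io.{ByteArrayOutputStream, ObjectOutputStream}

// Hypothetical wrapper with the same shape as CaseInsensitiveMap: without the
// `with Serializable` mix-in, writeObject below fails with NotSerializableException.
class LowerCaseKeyMap(map: Map[String, String])
  extends Map[String, String] with Serializable {

  private val baseMap = map.map { case (k, v) => k.toLowerCase -> v }

  override def get(k: String): Option[String] = baseMap.get(k.toLowerCase)
  override def iterator: Iterator[(String, String)] = baseMap.iterator
  override def +[B1 >: String](kv: (String, B1)): Map[String, B1] = baseMap + kv
  override def -(key: String): Map[String, String] = new LowerCaseKeyMap(baseMap - key.toLowerCase)
}

object SerializableMapCheck {
  def main(args: Array[String]): Unit = {
    val m = new LowerCaseKeyMap(Map("Path" -> "/tmp/data"))
    // Succeeds only because Serializable is mixed in; the wrapped immutable Map is
    // already serializable, so the wrapper was the only missing piece.
    new ObjectOutputStream(new ByteArrayOutputStream()).writeObject(m)
    println(m.get("PATH")) // Some(/tmp/data)
  }
}
```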
Author: luogankun Closes #3944 from luogankun/SPARK-5141 and squashes the following commits: b6d63d5 [luogankun] [SPARK-5141]CaseInsensitiveMap throws java.io.NotSerializableException --- sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index 8a66ac31f2dfb..364bacec83b98 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -110,7 +110,8 @@ private[sql] case class CreateTableUsing( /** * Builds a map in which keys are case insensitive */ -protected class CaseInsensitiveMap(map: Map[String, String]) extends Map[String, String] { +protected class CaseInsensitiveMap(map: Map[String, String]) extends Map[String, String] + with Serializable { val baseMap = map.map(kv => kv.copy(_1 = kv._1.toLowerCase)) From 1e56eba5d906bef793dfd6f199db735a6116a764 Mon Sep 17 00:00:00 2001 From: Alex Liu Date: Sat, 10 Jan 2015 13:19:12 -0800 Subject: [PATCH 104/116] [SPARK-4925][SQL] Publish Spark SQL hive-thriftserver maven artifact Author: Alex Liu Closes #3766 from alexliu68/SPARK-SQL-4925 and squashes the following commits: 3137b51 [Alex Liu] [SPARK-4925][SQL] Remove sql/hive-thriftserver module from pom.xml 15f2e38 [Alex Liu] [SPARK-4925][SQL] Publish Spark SQL hive-thriftserver maven artifact --- sql/hive-thriftserver/pom.xml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 259eef0b80d03..123a1f629ab1c 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -76,13 +76,6 @@ - - org.apache.maven.plugins - maven-deploy-plugin - - true - - From 4b39fd1e63188821fc84a13f7ccb6e94277f4be7 Mon Sep 17 00:00:00 2001 From: Alex Liu Date: Sat, 10 Jan 2015 13:23:09 -0800 Subject: [PATCH 105/116] [SPARK-4943][SQL] Allow table name having dot for db/catalog The pull only fixes the parsing error and changes API to use tableIdentifier. Joining different catalog datasource related change is not done in this pull. Author: Alex Liu Closes #3941 from alexliu68/SPARK-SQL-4943-3 and squashes the following commits: 343ae27 [Alex Liu] [SPARK-4943][SQL] refactoring according to review 29e5e55 [Alex Liu] [SPARK-4943][SQL] fix failed Hive CTAS tests 6ae77ce [Alex Liu] [SPARK-4943][SQL] fix TestHive matching error 3652997 [Alex Liu] [SPARK-4943][SQL] Allow table name having dot to support db/catalog ... 
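The refactor below replaces the separate `databaseName`/`tableName` arguments with a single `tableIdentifier: Seq[String]`, so `db.table` names parse naturally. A small sketch of the helper logic as a standalone object (the object name is invented; the real methods live on the `Catalog` trait in the diff):

```scala
// Minimal sketch of the Seq[String]-based table identifier handling introduced below,
// shown as a hypothetical standalone object rather than Spark's Catalog trait.
object TableIdentSketch {

  // Normalize case, mirroring processTableIdentifier for a case-insensitive catalog.
  def processTableIdentifier(ident: Seq[String], caseSensitive: Boolean): Seq[String] =
    if (caseSensitive) ident else ident.map(_.toLowerCase)

  // "db.table" key form used by SimpleCatalog; longer identifiers keep the last two parts.
  def getDbTableName(ident: Seq[String]): String =
    if (ident.size <= 2) ident.mkString(".") else ident.takeRight(2).mkString(".")

  // (Optional database, table) pair, as used by OverrideCatalog's overrides map.
  def getDBTable(ident: Seq[String]): (Option[String], String) =
    (ident.lift(ident.size - 2), ident.last)

  def main(args: Array[String]): Unit = {
    val ident = processTableIdentifier(Seq("MyDb", "MyTable"), caseSensitive = false)
    println(getDbTableName(ident))         // mydb.mytable
    println(getDBTable(ident))             // (Some(mydb),mytable)
    println(getDBTable(Seq("justatable"))) // (None,justatable)
  }
}
```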
--- .../apache/spark/sql/catalyst/SqlParser.scala | 6 +- .../sql/catalyst/analysis/Analyzer.scala | 8 +- .../spark/sql/catalyst/analysis/Catalog.scala | 106 +++++++++--------- .../sql/catalyst/analysis/unresolved.scala | 3 +- .../spark/sql/catalyst/dsl/package.scala | 2 +- .../sql/catalyst/analysis/AnalysisSuite.scala | 20 ++-- .../analysis/DecimalPrecisionSuite.scala | 2 +- .../org/apache/spark/sql/SQLContext.scala | 6 +- .../org/apache/spark/sql/SchemaRDDLike.scala | 4 +- .../org/apache/spark/sql/JoinSuite.scala | 4 +- .../apache/spark/sql/hive/HiveContext.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 52 ++++++--- .../org/apache/spark/sql/hive/HiveQl.scala | 29 +++-- .../org/apache/spark/sql/hive/TestHive.scala | 2 +- .../hive/execution/CreateTableAsSelect.scala | 4 +- .../spark/sql/hive/execution/commands.scala | 2 +- .../spark/sql/hive/StatisticsSuite.scala | 4 +- 17 files changed, 143 insertions(+), 113 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index f79d4ff444dc0..fc7b8745590d1 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -178,10 +178,10 @@ class SqlParser extends AbstractSparkSQLParser { joinedRelation | relationFactor protected lazy val relationFactor: Parser[LogicalPlan] = - ( ident ~ (opt(AS) ~> opt(ident)) ^^ { - case tableName ~ alias => UnresolvedRelation(None, tableName, alias) + ( rep1sep(ident, ".") ~ (opt(AS) ~> opt(ident)) ^^ { + case tableIdent ~ alias => UnresolvedRelation(tableIdent, alias) } - | ("(" ~> start <~ ")") ~ (AS.? ~> ident) ^^ { case s ~ a => Subquery(a, s) } + | ("(" ~> start <~ ")") ~ (AS.? 
~> ident) ^^ { case s ~ a => Subquery(a, s) } ) protected lazy val joinedRelation: Parser[LogicalPlan] = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 72680f37a0b4d..c009cc1e1e85c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -228,11 +228,11 @@ class Analyzer(catalog: Catalog, */ object ResolveRelations extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transform { - case i @ InsertIntoTable(UnresolvedRelation(databaseName, name, alias), _, _, _) => + case i @ InsertIntoTable(UnresolvedRelation(tableIdentifier, alias), _, _, _) => i.copy( - table = EliminateAnalysisOperators(catalog.lookupRelation(databaseName, name, alias))) - case UnresolvedRelation(databaseName, name, alias) => - catalog.lookupRelation(databaseName, name, alias) + table = EliminateAnalysisOperators(catalog.lookupRelation(tableIdentifier, alias))) + case UnresolvedRelation(tableIdentifier, alias) => + catalog.lookupRelation(tableIdentifier, alias) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala index 0415d74bd8141..df8d03b86c533 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala @@ -28,77 +28,74 @@ trait Catalog { def caseSensitive: Boolean - def tableExists(db: Option[String], tableName: String): Boolean + def tableExists(tableIdentifier: Seq[String]): Boolean def lookupRelation( - databaseName: Option[String], - tableName: String, - alias: Option[String] = None): LogicalPlan + tableIdentifier: Seq[String], + alias: Option[String] = None): LogicalPlan - def registerTable(databaseName: Option[String], tableName: String, plan: LogicalPlan): Unit + def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit - def unregisterTable(databaseName: Option[String], tableName: String): Unit + def unregisterTable(tableIdentifier: Seq[String]): Unit def unregisterAllTables(): Unit - protected def processDatabaseAndTableName( - databaseName: Option[String], - tableName: String): (Option[String], String) = { + protected def processTableIdentifier(tableIdentifier: Seq[String]): Seq[String] = { if (!caseSensitive) { - (databaseName.map(_.toLowerCase), tableName.toLowerCase) + tableIdentifier.map(_.toLowerCase) } else { - (databaseName, tableName) + tableIdentifier } } - protected def processDatabaseAndTableName( - databaseName: String, - tableName: String): (String, String) = { - if (!caseSensitive) { - (databaseName.toLowerCase, tableName.toLowerCase) + protected def getDbTableName(tableIdent: Seq[String]): String = { + val size = tableIdent.size + if (size <= 2) { + tableIdent.mkString(".") } else { - (databaseName, tableName) + tableIdent.slice(size - 2, size).mkString(".") } } + + protected def getDBTable(tableIdent: Seq[String]) : (Option[String], String) = { + (tableIdent.lift(tableIdent.size - 2), tableIdent.last) + } } class SimpleCatalog(val caseSensitive: Boolean) extends Catalog { val tables = new mutable.HashMap[String, LogicalPlan]() override def registerTable( - databaseName: Option[String], - tableName: String, + tableIdentifier: Seq[String], plan: 
LogicalPlan): Unit = { - val (dbName, tblName) = processDatabaseAndTableName(databaseName, tableName) - tables += ((tblName, plan)) + val tableIdent = processTableIdentifier(tableIdentifier) + tables += ((getDbTableName(tableIdent), plan)) } - override def unregisterTable( - databaseName: Option[String], - tableName: String) = { - val (dbName, tblName) = processDatabaseAndTableName(databaseName, tableName) - tables -= tblName + override def unregisterTable(tableIdentifier: Seq[String]) = { + val tableIdent = processTableIdentifier(tableIdentifier) + tables -= getDbTableName(tableIdent) } override def unregisterAllTables() = { tables.clear() } - override def tableExists(db: Option[String], tableName: String): Boolean = { - val (dbName, tblName) = processDatabaseAndTableName(db, tableName) - tables.get(tblName) match { + override def tableExists(tableIdentifier: Seq[String]): Boolean = { + val tableIdent = processTableIdentifier(tableIdentifier) + tables.get(getDbTableName(tableIdent)) match { case Some(_) => true case None => false } } override def lookupRelation( - databaseName: Option[String], - tableName: String, + tableIdentifier: Seq[String], alias: Option[String] = None): LogicalPlan = { - val (dbName, tblName) = processDatabaseAndTableName(databaseName, tableName) - val table = tables.getOrElse(tblName, sys.error(s"Table Not Found: $tableName")) - val tableWithQualifiers = Subquery(tblName, table) + val tableIdent = processTableIdentifier(tableIdentifier) + val tableFullName = getDbTableName(tableIdent) + val table = tables.getOrElse(tableFullName, sys.error(s"Table Not Found: $tableFullName")) + val tableWithQualifiers = Subquery(tableIdent.last, table) // If an alias was specified by the lookup, wrap the plan in a subquery so that attributes are // properly qualified with this alias. @@ -117,41 +114,39 @@ trait OverrideCatalog extends Catalog { // TODO: This doesn't work when the database changes... val overrides = new mutable.HashMap[(Option[String],String), LogicalPlan]() - abstract override def tableExists(db: Option[String], tableName: String): Boolean = { - val (dbName, tblName) = processDatabaseAndTableName(db, tableName) - overrides.get((dbName, tblName)) match { + abstract override def tableExists(tableIdentifier: Seq[String]): Boolean = { + val tableIdent = processTableIdentifier(tableIdentifier) + overrides.get(getDBTable(tableIdent)) match { case Some(_) => true - case None => super.tableExists(db, tableName) + case None => super.tableExists(tableIdentifier) } } abstract override def lookupRelation( - databaseName: Option[String], - tableName: String, + tableIdentifier: Seq[String], alias: Option[String] = None): LogicalPlan = { - val (dbName, tblName) = processDatabaseAndTableName(databaseName, tableName) - val overriddenTable = overrides.get((dbName, tblName)) - val tableWithQualifers = overriddenTable.map(r => Subquery(tblName, r)) + val tableIdent = processTableIdentifier(tableIdentifier) + val overriddenTable = overrides.get(getDBTable(tableIdent)) + val tableWithQualifers = overriddenTable.map(r => Subquery(tableIdent.last, r)) // If an alias was specified by the lookup, wrap the plan in a subquery so that attributes are // properly qualified with this alias. 
val withAlias = tableWithQualifers.map(r => alias.map(a => Subquery(a, r)).getOrElse(r)) - withAlias.getOrElse(super.lookupRelation(dbName, tblName, alias)) + withAlias.getOrElse(super.lookupRelation(tableIdentifier, alias)) } override def registerTable( - databaseName: Option[String], - tableName: String, + tableIdentifier: Seq[String], plan: LogicalPlan): Unit = { - val (dbName, tblName) = processDatabaseAndTableName(databaseName, tableName) - overrides.put((dbName, tblName), plan) + val tableIdent = processTableIdentifier(tableIdentifier) + overrides.put(getDBTable(tableIdent), plan) } - override def unregisterTable(databaseName: Option[String], tableName: String): Unit = { - val (dbName, tblName) = processDatabaseAndTableName(databaseName, tableName) - overrides.remove((dbName, tblName)) + override def unregisterTable(tableIdentifier: Seq[String]): Unit = { + val tableIdent = processTableIdentifier(tableIdentifier) + overrides.remove(getDBTable(tableIdent)) } override def unregisterAllTables(): Unit = { @@ -167,22 +162,21 @@ object EmptyCatalog extends Catalog { val caseSensitive: Boolean = true - def tableExists(db: Option[String], tableName: String): Boolean = { + def tableExists(tableIdentifier: Seq[String]): Boolean = { throw new UnsupportedOperationException } def lookupRelation( - databaseName: Option[String], - tableName: String, + tableIdentifier: Seq[String], alias: Option[String] = None) = { throw new UnsupportedOperationException } - def registerTable(databaseName: Option[String], tableName: String, plan: LogicalPlan): Unit = { + def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit = { throw new UnsupportedOperationException } - def unregisterTable(databaseName: Option[String], tableName: String): Unit = { + def unregisterTable(tableIdentifier: Seq[String]): Unit = { throw new UnsupportedOperationException } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 77d84e1687e1b..71a738a0b2ca0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -34,8 +34,7 @@ class UnresolvedException[TreeType <: TreeNode[_]](tree: TreeType, function: Str * Holds the name of a relation that has yet to be looked up in a [[Catalog]]. 
*/ case class UnresolvedRelation( - databaseName: Option[String], - tableName: String, + tableIdentifier: Seq[String], alias: Option[String] = None) extends LeafNode { override def output = Nil override lazy val resolved = false diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 9608e15c0f302..b2262e5e6efb6 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -290,7 +290,7 @@ package object dsl { def insertInto(tableName: String, overwrite: Boolean = false) = InsertIntoTable( - analysis.UnresolvedRelation(None, tableName), Map.empty, logicalPlan, overwrite) + analysis.UnresolvedRelation(Seq(tableName)), Map.empty, logicalPlan, overwrite) def analyze = analysis.SimpleAnalyzer(logicalPlan) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 82f2101d8ce17..f430057ef7191 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -44,8 +44,8 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { AttributeReference("e", ShortType)()) before { - caseSensitiveCatalog.registerTable(None, "TaBlE", testRelation) - caseInsensitiveCatalog.registerTable(None, "TaBlE", testRelation) + caseSensitiveCatalog.registerTable(Seq("TaBlE"), testRelation) + caseInsensitiveCatalog.registerTable(Seq("TaBlE"), testRelation) } test("union project *") { @@ -64,45 +64,45 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { assert( caseSensitiveAnalyze( Project(Seq(UnresolvedAttribute("TbL.a")), - UnresolvedRelation(None, "TaBlE", Some("TbL")))) === + UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) === Project(testRelation.output, testRelation)) val e = intercept[TreeNodeException[_]] { caseSensitiveAnalyze( Project(Seq(UnresolvedAttribute("tBl.a")), - UnresolvedRelation(None, "TaBlE", Some("TbL")))) + UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) } assert(e.getMessage().toLowerCase.contains("unresolved")) assert( caseInsensitiveAnalyze( Project(Seq(UnresolvedAttribute("TbL.a")), - UnresolvedRelation(None, "TaBlE", Some("TbL")))) === + UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) === Project(testRelation.output, testRelation)) assert( caseInsensitiveAnalyze( Project(Seq(UnresolvedAttribute("tBl.a")), - UnresolvedRelation(None, "TaBlE", Some("TbL")))) === + UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) === Project(testRelation.output, testRelation)) } test("resolve relations") { val e = intercept[RuntimeException] { - caseSensitiveAnalyze(UnresolvedRelation(None, "tAbLe", None)) + caseSensitiveAnalyze(UnresolvedRelation(Seq("tAbLe"), None)) } assert(e.getMessage == "Table Not Found: tAbLe") assert( - caseSensitiveAnalyze(UnresolvedRelation(None, "TaBlE", None)) === + caseSensitiveAnalyze(UnresolvedRelation(Seq("TaBlE"), None)) === testRelation) assert( - caseInsensitiveAnalyze(UnresolvedRelation(None, "tAbLe", None)) === + caseInsensitiveAnalyze(UnresolvedRelation(Seq("tAbLe"), None)) === testRelation) assert( - caseInsensitiveAnalyze(UnresolvedRelation(None, "TaBlE", None)) === + caseInsensitiveAnalyze(UnresolvedRelation(Seq("TaBlE"), None)) === testRelation) } diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala index 3677a6e72e23a..bbbeb4f2e4fe3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala @@ -41,7 +41,7 @@ class DecimalPrecisionSuite extends FunSuite with BeforeAndAfter { val f: Expression = UnresolvedAttribute("f") before { - catalog.registerTable(None, "table", relation) + catalog.registerTable(Seq("table"), relation) } private def checkType(expression: Expression, expectedType: DataType): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 6a1a4d995bf61..9962937277dad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -276,7 +276,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * @group userf */ def registerRDDAsTable(rdd: SchemaRDD, tableName: String): Unit = { - catalog.registerTable(None, tableName, rdd.queryExecution.logical) + catalog.registerTable(Seq(tableName), rdd.queryExecution.logical) } /** @@ -289,7 +289,7 @@ class SQLContext(@transient val sparkContext: SparkContext) */ def dropTempTable(tableName: String): Unit = { tryUncacheQuery(table(tableName)) - catalog.unregisterTable(None, tableName) + catalog.unregisterTable(Seq(tableName)) } /** @@ -308,7 +308,7 @@ class SQLContext(@transient val sparkContext: SparkContext) /** Returns the specified table as a SchemaRDD */ def table(tableName: String): SchemaRDD = - new SchemaRDD(this, catalog.lookupRelation(None, tableName)) + new SchemaRDD(this, catalog.lookupRelation(Seq(tableName))) /** * :: DeveloperApi :: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala index fd5f4abcbcd65..3cf9209465b76 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDDLike.scala @@ -97,8 +97,8 @@ private[sql] trait SchemaRDDLike { */ @Experimental def insertInto(tableName: String, overwrite: Boolean): Unit = - sqlContext.executePlan( - InsertIntoTable(UnresolvedRelation(None, tableName), Map.empty, logicalPlan, overwrite)).toRdd + sqlContext.executePlan(InsertIntoTable(UnresolvedRelation(Seq(tableName)), + Map.empty, logicalPlan, overwrite)).toRdd /** * :: Experimental :: diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 1a4232dab86e7..c7e136388fce8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -302,8 +302,8 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { upperCaseData.where('N <= 4).registerTempTable("left") upperCaseData.where('N >= 3).registerTempTable("right") - val left = UnresolvedRelation(None, "left", None) - val right = UnresolvedRelation(None, "right", None) + val left = UnresolvedRelation(Seq("left"), None) + val right = UnresolvedRelation(Seq("right"), None) checkAnswer( left.join(right, FullOuter, Some("left.N".attr === "right.N".attr)), diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 982e0593fcfd1..1648fa826b900 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -124,7 +124,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { * in the Hive metastore. */ def analyze(tableName: String) { - val relation = EliminateAnalysisOperators(catalog.lookupRelation(None, tableName)) + val relation = EliminateAnalysisOperators(catalog.lookupRelation(Seq(tableName))) relation match { case relation: MetastoreRelation => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index b31a3ec25096b..2c859894cf8d3 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -30,6 +30,7 @@ import org.apache.hadoop.hive.metastore.TableType import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.hadoop.hive.metastore.api.{Table => TTable, Partition => TPartition} import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table, HiveException} +import org.apache.hadoop.hive.ql.metadata.InvalidTableException import org.apache.hadoop.hive.ql.plan.CreateTableDesc import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.{Deserializer, SerDeException} @@ -57,18 +58,25 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val caseSensitive: Boolean = false - def tableExists(db: Option[String], tableName: String): Boolean = { - val (databaseName, tblName) = processDatabaseAndTableName( - db.getOrElse(hive.sessionState.getCurrentDatabase), tableName) - client.getTable(databaseName, tblName, false) != null + def tableExists(tableIdentifier: Seq[String]): Boolean = { + val tableIdent = processTableIdentifier(tableIdentifier) + val databaseName = tableIdent.lift(tableIdent.size - 2).getOrElse( + hive.sessionState.getCurrentDatabase) + val tblName = tableIdent.last + try { + client.getTable(databaseName, tblName) != null + } catch { + case ie: InvalidTableException => false + } } def lookupRelation( - db: Option[String], - tableName: String, + tableIdentifier: Seq[String], alias: Option[String]): LogicalPlan = synchronized { - val (databaseName, tblName) = - processDatabaseAndTableName(db.getOrElse(hive.sessionState.getCurrentDatabase), tableName) + val tableIdent = processTableIdentifier(tableIdentifier) + val databaseName = tableIdent.lift(tableIdent.size - 2).getOrElse( + hive.sessionState.getCurrentDatabase) + val tblName = tableIdent.last val table = client.getTable(databaseName, tblName) if (table.isView) { // if the unresolved relation is from hive view @@ -251,6 +259,26 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with } } + protected def processDatabaseAndTableName( + databaseName: Option[String], + tableName: String): (Option[String], String) = { + if (!caseSensitive) { + (databaseName.map(_.toLowerCase), tableName.toLowerCase) + } else { + (databaseName, tableName) + } + } + + protected def processDatabaseAndTableName( + databaseName: String, + tableName: String): (String, String) = { + if (!caseSensitive) { + (databaseName.toLowerCase, tableName.toLowerCase) + } else { + (databaseName, tableName) + 
} + } + /** * Creates any tables required for query execution. * For example, because of a CREATE TABLE X AS statement. @@ -270,7 +298,7 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val databaseName = dbName.getOrElse(hive.sessionState.getCurrentDatabase) // Get the CreateTableDesc from Hive SemanticAnalyzer - val desc: Option[CreateTableDesc] = if (tableExists(Some(databaseName), tblName)) { + val desc: Option[CreateTableDesc] = if (tableExists(Seq(databaseName, tblName))) { None } else { val sa = new SemanticAnalyzer(hive.hiveconf) { @@ -352,15 +380,13 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with * UNIMPLEMENTED: It needs to be decided how we will persist in-memory tables to the metastore. * For now, if this functionality is desired mix in the in-memory [[OverrideCatalog]]. */ - override def registerTable( - databaseName: Option[String], tableName: String, plan: LogicalPlan): Unit = ??? + override def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit = ??? /** * UNIMPLEMENTED: It needs to be decided how we will persist in-memory tables to the metastore. * For now, if this functionality is desired mix in the in-memory [[OverrideCatalog]]. */ - override def unregisterTable( - databaseName: Option[String], tableName: String): Unit = ??? + override def unregisterTable(tableIdentifier: Seq[String]): Unit = ??? override def unregisterAllTables() = {} } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 8a9613cf96e54..c2ab3579c1f95 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -386,6 +386,15 @@ private[hive] object HiveQl { (db, tableName) } + protected def extractTableIdent(tableNameParts: Node): Seq[String] = { + tableNameParts.getChildren.map { case Token(part, Nil) => cleanIdentifier(part) } match { + case Seq(tableOnly) => Seq(tableOnly) + case Seq(databaseName, table) => Seq(databaseName, table) + case other => sys.error("Hive only supports tables names like 'tableName' " + + s"or 'databaseName.tableName', found '$other'") + } + } + /** * SELECT MAX(value) FROM src GROUP BY k1, k2, k3 GROUPING SETS((k1, k2), (k2)) * is equivalent to @@ -475,16 +484,16 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case Token(".", dbName :: tableName :: Nil) => // It is describing a table with the format like "describe db.table". // TODO: Actually, a user may mean tableName.columnName. Need to resolve this issue. - val (db, tableName) = extractDbNameTableName(nameParts.head) + val tableIdent = extractTableIdent(nameParts.head) DescribeCommand( - UnresolvedRelation(db, tableName, None), extended.isDefined) + UnresolvedRelation(tableIdent, None), extended.isDefined) case Token(".", dbName :: tableName :: colName :: Nil) => // It is describing a column with the format like "describe db.table column". NativePlaceholder case tableName => // It is describing a table with the format like "describe table". 
DescribeCommand( - UnresolvedRelation(None, tableName.getText, None), + UnresolvedRelation(Seq(tableName.getText), None), extended.isDefined) } } @@ -757,13 +766,15 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C nonAliasClauses) } - val (db, tableName) = + val tableIdent = tableNameParts.getChildren.map{ case Token(part, Nil) => cleanIdentifier(part)} match { - case Seq(tableOnly) => (None, tableOnly) - case Seq(databaseName, table) => (Some(databaseName), table) + case Seq(tableOnly) => Seq(tableOnly) + case Seq(databaseName, table) => Seq(databaseName, table) + case other => sys.error("Hive only supports tables names like 'tableName' " + + s"or 'databaseName.tableName', found '$other'") } val alias = aliasClause.map { case Token(a, Nil) => cleanIdentifier(a) } - val relation = UnresolvedRelation(db, tableName, alias) + val relation = UnresolvedRelation(tableIdent, alias) // Apply sampling if requested. (bucketSampleClause orElse splitSampleClause).map { @@ -882,7 +893,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val Some(tableNameParts) :: partitionClause :: Nil = getClauses(Seq("TOK_TABNAME", "TOK_PARTSPEC"), tableArgs) - val (db, tableName) = extractDbNameTableName(tableNameParts) + val tableIdent = extractTableIdent(tableNameParts) val partitionKeys = partitionClause.map(_.getChildren.map { // Parse partitions. We also make keys case insensitive. @@ -892,7 +903,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C cleanIdentifier(key.toLowerCase) -> None }.toMap).getOrElse(Map.empty) - InsertIntoTable(UnresolvedRelation(db, tableName, None), partitionKeys, query, overwrite) + InsertIntoTable(UnresolvedRelation(tableIdent, None), partitionKeys, query, overwrite) case a: ASTNode => throw new NotImplementedError(s"No parse rules for:\n ${dumpTree(a).toString} ") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index b2149bd95a336..8f2311cf83eb8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -167,7 +167,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { // Make sure any test tables referenced are loaded. 
val referencedTables = describedTables ++ - logical.collect { case UnresolvedRelation(databaseName, name, _) => name } + logical.collect { case UnresolvedRelation(tableIdent, _) => tableIdent.last } val referencedTestTables = referencedTables.filter(testTables.contains) logDebug(s"Query references test tables: ${referencedTestTables.mkString(", ")}") referencedTestTables.foreach(loadTestTable) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala index fe21454e7fb38..a547babcebfff 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala @@ -53,14 +53,14 @@ case class CreateTableAsSelect( hiveContext.catalog.createTable(database, tableName, query.output, allowExisting, desc) // Get the Metastore Relation - hiveContext.catalog.lookupRelation(Some(database), tableName, None) match { + hiveContext.catalog.lookupRelation(Seq(database, tableName), None) match { case r: MetastoreRelation => r } } // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. - if (hiveContext.catalog.tableExists(Some(database), tableName)) { + if (hiveContext.catalog.tableExists(Seq(database, tableName))) { if (allowExisting) { // table already exists, will do nothing, to keep consistent with Hive } else { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 6fc4153f6a5df..6b733a280e6d5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -53,7 +53,7 @@ case class DropTable( val hiveContext = sqlContext.asInstanceOf[HiveContext] val ifExistsClause = if (ifExists) "IF EXISTS " else "" hiveContext.runSqlHive(s"DROP TABLE $ifExistsClause$tableName") - hiveContext.catalog.unregisterTable(None, tableName) + hiveContext.catalog.unregisterTable(Seq(tableName)) Seq.empty[Row] } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 4b6a9308b9811..a758f921e0417 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -72,7 +72,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { test("analyze MetastoreRelations") { def queryTotalSize(tableName: String): BigInt = - catalog.lookupRelation(None, tableName).statistics.sizeInBytes + catalog.lookupRelation(Seq(tableName)).statistics.sizeInBytes // Non-partitioned table sql("CREATE TABLE analyzeTable (key STRING, value STRING)").collect() @@ -123,7 +123,7 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { intercept[NotImplementedError] { analyze("tempTable") } - catalog.unregisterTable(None, "tempTable") + catalog.unregisterTable(Seq("tempTable")) } test("estimates the size of a test MetastoreRelation") { From 693a323a70aba91e6c100dd5561d218a75b7895e Mon Sep 17 00:00:00 2001 From: scwf Date: Sat, 10 Jan 2015 13:53:21 -0800 Subject: [PATCH 106/116] [SPARK-4574][SQL] Adding support for defining schema in foreign DDL commands. 
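This patch introduces a `SchemaRelationProvider` contract so a data source can accept the column list declared in the DDL instead of inferring it from the data. A minimal hypothetical source implementing that contract (class names here are invented; the real trait definition and the JSON usage appear in the hunks below):

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, StructType}
import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider, TableScan}

// Hypothetical data source used only to illustrate the new contract: the schema the user
// declares in `CREATE TEMPORARY TABLE t (...) USING ...` is handed to createRelation
// rather than inferred from the underlying data.
class EmptySchemaSource extends SchemaRelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): BaseRelation =
    EmptyRelation(schema)(sqlContext)
}

case class EmptyRelation(userSpecifiedSchema: StructType)(
    @transient val sqlContext: SQLContext) extends TableScan {

  // Exactly the schema declared in the DDL statement.
  override def schema: StructType = userSpecifiedSchema

  // Returns no rows at all; a real source would read parameters such as `path` here.
  override def buildScan(): RDD[Row] = sqlContext.sparkContext.parallelize(Seq.empty[Row])
}
```

With such a class on the classpath, a statement like `CREATE TEMPORARY TABLE t (a INT, b STRING) USING fully.qualified.EmptySchemaSource` would register a table whose schema is exactly the declared columns, without reading any data.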
Adding support for defining schema in foreign DDL commands. Now foreign DDL support commands like: ``` CREATE TEMPORARY TABLE avroTable USING org.apache.spark.sql.avro OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro") ``` With this PR user can define schema instead of infer from file, so support ddl command as follows: ``` CREATE TEMPORARY TABLE avroTable(a int, b string) USING org.apache.spark.sql.avro OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro") ``` Author: scwf Author: Yin Huai Author: Fei Wang Author: wangfei Closes #3431 from scwf/ddl and squashes the following commits: 7e79ce5 [Fei Wang] Merge pull request #22 from yhuai/pr3431yin 38f634e [Yin Huai] Remove Option from createRelation. 65e9c73 [Yin Huai] Revert all changes since applying a given schema has not been testd. a852b10 [scwf] remove cleanIdentifier f336a16 [Fei Wang] Merge pull request #21 from yhuai/pr3431yin baf79b5 [Yin Huai] Test special characters quoted by backticks. 50a03b0 [Yin Huai] Use JsonRDD.nullTypeToStringType to convert NullType to StringType. 1eeb769 [Fei Wang] Merge pull request #20 from yhuai/pr3431yin f5c22b0 [Yin Huai] Refactor code and update test cases. f1cffe4 [Yin Huai] Revert "minor refactory" b621c8f [scwf] minor refactory d02547f [scwf] fix HiveCompatibilitySuite test failure 8dfbf7a [scwf] more tests for complex data type ddab984 [Fei Wang] Merge pull request #19 from yhuai/pr3431yin 91ad91b [Yin Huai] Parse data types in DDLParser. cf982d2 [scwf] fixed test failure 445b57b [scwf] address comments 02a662c [scwf] style issue 44eb70c [scwf] fix decimal parser issue 83b6fc3 [scwf] minor fix 9bf12f8 [wangfei] adding test case 7787ec7 [wangfei] added SchemaRelationProvider 0ba70df [wangfei] draft version --- .../apache/spark/sql/json/JSONRelation.scala | 35 +++- .../org/apache/spark/sql/sources/ddl.scala | 138 +++++++++++-- .../apache/spark/sql/sources/interfaces.scala | 29 ++- .../spark/sql/sources/TableScanSuite.scala | 192 ++++++++++++++++++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 114 +++-------- .../sql/hive/HiveMetastoreCatalogSuite.scala | 5 +- 6 files changed, 400 insertions(+), 113 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala index fc70c183437f6..a9a6696cb15e4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala @@ -18,31 +18,48 @@ package org.apache.spark.sql.json import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.catalyst.types.StructType import org.apache.spark.sql.sources._ -private[sql] class DefaultSource extends RelationProvider { - /** Returns a new base relation with the given parameters. */ +private[sql] class DefaultSource extends RelationProvider with SchemaRelationProvider { + + /** Returns a new base relation with the parameters. */ override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val fileName = parameters.getOrElse("path", sys.error("Option 'path' not specified")) val samplingRatio = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) - JSONRelation(fileName, samplingRatio)(sqlContext) + JSONRelation(fileName, samplingRatio, None)(sqlContext) + } + + /** Returns a new base relation with the given schema and parameters. 
*/ + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String], + schema: StructType): BaseRelation = { + val fileName = parameters.getOrElse("path", sys.error("Option 'path' not specified")) + val samplingRatio = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) + + JSONRelation(fileName, samplingRatio, Some(schema))(sqlContext) } } -private[sql] case class JSONRelation(fileName: String, samplingRatio: Double)( +private[sql] case class JSONRelation( + fileName: String, + samplingRatio: Double, + userSpecifiedSchema: Option[StructType])( @transient val sqlContext: SQLContext) extends TableScan { private def baseRDD = sqlContext.sparkContext.textFile(fileName) - override val schema = - JsonRDD.inferSchema( - baseRDD, - samplingRatio, - sqlContext.columnNameOfCorruptRecord) + override val schema = userSpecifiedSchema.getOrElse( + JsonRDD.nullTypeToStringType( + JsonRDD.inferSchema( + baseRDD, + samplingRatio, + sqlContext.columnNameOfCorruptRecord))) override def buildScan() = JsonRDD.jsonStringToRow(baseRDD, schema, sqlContext.columnNameOfCorruptRecord) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index 364bacec83b98..fe2c4d8436b2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -17,16 +17,15 @@ package org.apache.spark.sql.sources -import org.apache.spark.Logging -import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.execution.RunnableCommand -import org.apache.spark.util.Utils - import scala.language.implicitConversions -import scala.util.parsing.combinator.lexical.StdLexical import scala.util.parsing.combinator.syntactical.StandardTokenParsers import scala.util.parsing.combinator.PackratParsers +import org.apache.spark.Logging +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.catalyst.types._ +import org.apache.spark.sql.execution.RunnableCommand +import org.apache.spark.util.Utils import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.SqlLexical @@ -44,6 +43,14 @@ private[sql] class DDLParser extends StandardTokenParsers with PackratParsers wi } } + def parseType(input: String): DataType = { + phrase(dataType)(new lexical.Scanner(input)) match { + case Success(r, x) => r + case x => + sys.error(s"Unsupported dataType: $x") + } + } + protected case class Keyword(str: String) protected implicit def asParser(k: Keyword): Parser[String] = @@ -55,6 +62,24 @@ private[sql] class DDLParser extends StandardTokenParsers with PackratParsers wi protected val USING = Keyword("USING") protected val OPTIONS = Keyword("OPTIONS") + // Data types. + protected val STRING = Keyword("STRING") + protected val BINARY = Keyword("BINARY") + protected val BOOLEAN = Keyword("BOOLEAN") + protected val TINYINT = Keyword("TINYINT") + protected val SMALLINT = Keyword("SMALLINT") + protected val INT = Keyword("INT") + protected val BIGINT = Keyword("BIGINT") + protected val FLOAT = Keyword("FLOAT") + protected val DOUBLE = Keyword("DOUBLE") + protected val DECIMAL = Keyword("DECIMAL") + protected val DATE = Keyword("DATE") + protected val TIMESTAMP = Keyword("TIMESTAMP") + protected val VARCHAR = Keyword("VARCHAR") + protected val ARRAY = Keyword("ARRAY") + protected val MAP = Keyword("MAP") + protected val STRUCT = Keyword("STRUCT") + // Use reflection to find the reserved words defined in this class. 
protected val reservedWords = this.getClass @@ -67,15 +92,25 @@ private[sql] class DDLParser extends StandardTokenParsers with PackratParsers wi protected lazy val ddl: Parser[LogicalPlan] = createTable /** - * CREATE TEMPORARY TABLE avroTable + * `CREATE TEMPORARY TABLE avroTable * USING org.apache.spark.sql.avro - * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro") + * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")` + * or + * `CREATE TEMPORARY TABLE avroTable(intField int, stringField string...) + * USING org.apache.spark.sql.avro + * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")` */ protected lazy val createTable: Parser[LogicalPlan] = - CREATE ~ TEMPORARY ~ TABLE ~> ident ~ (USING ~> className) ~ (OPTIONS ~> options) ^^ { - case tableName ~ provider ~ opts => - CreateTableUsing(tableName, provider, opts) + ( + CREATE ~ TEMPORARY ~ TABLE ~> ident + ~ (tableCols).? ~ (USING ~> className) ~ (OPTIONS ~> options) ^^ { + case tableName ~ columns ~ provider ~ opts => + val userSpecifiedSchema = columns.flatMap(fields => Some(StructType(fields))) + CreateTableUsing(tableName, userSpecifiedSchema, provider, opts) } + ) + + protected lazy val tableCols: Parser[Seq[StructField]] = "(" ~> repsep(column, ",") <~ ")" protected lazy val options: Parser[Map[String, String]] = "(" ~> repsep(pair, ",") <~ ")" ^^ { case s: Seq[(String, String)] => s.toMap } @@ -83,10 +118,66 @@ private[sql] class DDLParser extends StandardTokenParsers with PackratParsers wi protected lazy val className: Parser[String] = repsep(ident, ".") ^^ { case s => s.mkString(".")} protected lazy val pair: Parser[(String, String)] = ident ~ stringLit ^^ { case k ~ v => (k,v) } + + protected lazy val column: Parser[StructField] = + ident ~ dataType ^^ { case columnName ~ typ => + StructField(columnName, typ) + } + + protected lazy val primitiveType: Parser[DataType] = + STRING ^^^ StringType | + BINARY ^^^ BinaryType | + BOOLEAN ^^^ BooleanType | + TINYINT ^^^ ByteType | + SMALLINT ^^^ ShortType | + INT ^^^ IntegerType | + BIGINT ^^^ LongType | + FLOAT ^^^ FloatType | + DOUBLE ^^^ DoubleType | + fixedDecimalType | // decimal with precision/scale + DECIMAL ^^^ DecimalType.Unlimited | // decimal with no precision/scale + DATE ^^^ DateType | + TIMESTAMP ^^^ TimestampType | + VARCHAR ~ "(" ~ numericLit ~ ")" ^^^ StringType + + protected lazy val fixedDecimalType: Parser[DataType] = + (DECIMAL ~ "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ { + case precision ~ scale => DecimalType(precision.toInt, scale.toInt) + } + + protected lazy val arrayType: Parser[DataType] = + ARRAY ~> "<" ~> dataType <~ ">" ^^ { + case tpe => ArrayType(tpe) + } + + protected lazy val mapType: Parser[DataType] = + MAP ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ { + case t1 ~ _ ~ t2 => MapType(t1, t2) + } + + protected lazy val structField: Parser[StructField] = + ident ~ ":" ~ dataType ^^ { + case fieldName ~ _ ~ tpe => StructField(fieldName, tpe, nullable = true) + } + + protected lazy val structType: Parser[DataType] = + (STRUCT ~> "<" ~> repsep(structField, ",") <~ ">" ^^ { + case fields => new StructType(fields) + }) | + (STRUCT ~> "<>" ^^ { + case fields => new StructType(Nil) + }) + + private[sql] lazy val dataType: Parser[DataType] = + arrayType | + mapType | + structType | + primitiveType } private[sql] case class CreateTableUsing( tableName: String, + userSpecifiedSchema: Option[StructType], provider: String, options: Map[String, String]) extends RunnableCommand { @@ -99,8 +190,29 @@ 
private[sql] case class CreateTableUsing( sys.error(s"Failed to load class for data source: $provider") } } - val dataSource = clazz.newInstance().asInstanceOf[org.apache.spark.sql.sources.RelationProvider] - val relation = dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options)) + + val relation = userSpecifiedSchema match { + case Some(schema: StructType) => { + clazz.newInstance match { + case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider => + dataSource + .asInstanceOf[org.apache.spark.sql.sources.SchemaRelationProvider] + .createRelation(sqlContext, new CaseInsensitiveMap(options), schema) + case _ => + sys.error(s"${clazz.getCanonicalName} should extend SchemaRelationProvider.") + } + } + case None => { + clazz.newInstance match { + case dataSource: org.apache.spark.sql.sources.RelationProvider => + dataSource + .asInstanceOf[org.apache.spark.sql.sources.RelationProvider] + .createRelation(sqlContext, new CaseInsensitiveMap(options)) + case _ => + sys.error(s"${clazz.getCanonicalName} should extend RelationProvider.") + } + } + } sqlContext.baseRelationToSchemaRDD(relation).registerTempTable(tableName) Seq.empty diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 02eff80456dbe..990f7e0e74bcf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.sources import org.apache.spark.annotation.{Experimental, DeveloperApi} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{SQLConf, Row, SQLContext, StructType} +import org.apache.spark.sql.{Row, SQLContext, StructType} import org.apache.spark.sql.catalyst.expressions.{Expression, Attribute} /** @@ -44,6 +44,33 @@ trait RelationProvider { def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation } +/** + * ::DeveloperApi:: + * Implemented by objects that produce relations for a specific kind of data source. When + * Spark SQL is given a DDL operation with + * 1. USING clause: to specify the implemented SchemaRelationProvider + * 2. User defined schema: users can define schema optionally when create table + * + * Users may specify the fully qualified class name of a given data source. When that class is + * not found Spark SQL will append the class name `DefaultSource` to the path, allowing for + * less verbose invocation. For example, 'org.apache.spark.sql.json' would resolve to the + * data source 'org.apache.spark.sql.json.DefaultSource' + * + * A new instance of this class with be instantiated each time a DDL call is made. + */ +@DeveloperApi +trait SchemaRelationProvider { + /** + * Returns a new base relation with the given parameters and user defined schema. + * Note: the parameters' keywords are case insensitive and this insensitivity is enforced + * by the Map that is passed to the function. + */ + def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String], + schema: StructType): BaseRelation +} + /** * ::DeveloperApi:: * Represents a collection of tuples with a known schema. 
Classes that extend BaseRelation must diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala index 3cd7b0115d567..605190f5ae6a2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala @@ -17,7 +17,10 @@ package org.apache.spark.sql.sources +import java.sql.{Timestamp, Date} + import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.types.DecimalType class DefaultSource extends SimpleScanSource @@ -38,9 +41,77 @@ case class SimpleScan(from: Int, to: Int)(@transient val sqlContext: SQLContext) override def buildScan() = sqlContext.sparkContext.parallelize(from to to).map(Row(_)) } +class AllDataTypesScanSource extends SchemaRelationProvider { + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String], + schema: StructType): BaseRelation = { + AllDataTypesScan(parameters("from").toInt, parameters("TO").toInt, schema)(sqlContext) + } +} + +case class AllDataTypesScan( + from: Int, + to: Int, + userSpecifiedSchema: StructType)(@transient val sqlContext: SQLContext) + extends TableScan { + + override def schema = userSpecifiedSchema + + override def buildScan() = { + sqlContext.sparkContext.parallelize(from to to).map { i => + Row( + s"str_$i", + s"str_$i".getBytes(), + i % 2 == 0, + i.toByte, + i.toShort, + i, + i.toLong, + i.toFloat, + i.toDouble, + BigDecimal(i), + BigDecimal(i), + new Date((i + 1) * 8640000), + new Timestamp(20000 + i), + s"varchar_$i", + Seq(i, i + 1), + Seq(Map(s"str_$i" -> Row(i.toLong))), + Map(i -> i.toString), + Map(Map(s"str_$i" -> i.toFloat) -> Row(i.toLong)), + Row(i, i.toString), + Row(Seq(s"str_$i", s"str_${i + 1}"), Row(Seq(new Date((i + 2) * 8640000))))) + } + } +} + class TableScanSuite extends DataSourceTest { import caseInsensisitiveContext._ + var tableWithSchemaExpected = (1 to 10).map { i => + Row( + s"str_$i", + s"str_$i", + i % 2 == 0, + i.toByte, + i.toShort, + i, + i.toLong, + i.toFloat, + i.toDouble, + BigDecimal(i), + BigDecimal(i), + new Date((i + 1) * 8640000), + new Timestamp(20000 + i), + s"varchar_$i", + Seq(i, i + 1), + Seq(Map(s"str_$i" -> Row(i.toLong))), + Map(i -> i.toString), + Map(Map(s"str_$i" -> i.toFloat) -> Row(i.toLong)), + Row(i, i.toString), + Row(Seq(s"str_$i", s"str_${i + 1}"), Row(Seq(new Date((i + 2) * 8640000))))) + }.toSeq + before { sql( """ @@ -51,6 +122,37 @@ class TableScanSuite extends DataSourceTest { | To '10' |) """.stripMargin) + + sql( + """ + |CREATE TEMPORARY TABLE tableWithSchema ( + |`string$%Field` stRIng, + |binaryField binary, + |`booleanField` boolean, + |ByteField tinyint, + |shortField smaLlint, + |int_Field iNt, + |`longField_:,<>=+/~^` Bigint, + |floatField flOat, + |doubleField doubLE, + |decimalField1 decimal, + |decimalField2 decimal(9,2), + |dateField dAte, + |timestampField tiMestamp, + |varcharField varchaR(12), + |arrayFieldSimple Array, + |arrayFieldComplex Array>>, + |mapFieldSimple MAP, + |mapFieldComplex Map, Struct>, + |structFieldSimple StRuct, + |structFieldComplex StRuct, Value:struct<`value_(2)`:Array>> + |) + |USING org.apache.spark.sql.sources.AllDataTypesScanSource + |OPTIONS ( + | From '1', + | To '10' + |) + """.stripMargin) } sqlTest( @@ -73,6 +175,96 @@ class TableScanSuite extends DataSourceTest { "SELECT a.i, b.i FROM oneToTen a JOIN oneToTen b ON a.i = b.i + 1", (2 to 10).map(i => Row(i, i - 1)).toSeq) + test("Schema and 
all fields") { + val expectedSchema = StructType( + StructField("string$%Field", StringType, true) :: + StructField("binaryField", BinaryType, true) :: + StructField("booleanField", BooleanType, true) :: + StructField("ByteField", ByteType, true) :: + StructField("shortField", ShortType, true) :: + StructField("int_Field", IntegerType, true) :: + StructField("longField_:,<>=+/~^", LongType, true) :: + StructField("floatField", FloatType, true) :: + StructField("doubleField", DoubleType, true) :: + StructField("decimalField1", DecimalType.Unlimited, true) :: + StructField("decimalField2", DecimalType(9, 2), true) :: + StructField("dateField", DateType, true) :: + StructField("timestampField", TimestampType, true) :: + StructField("varcharField", StringType, true) :: + StructField("arrayFieldSimple", ArrayType(IntegerType), true) :: + StructField("arrayFieldComplex", + ArrayType( + MapType(StringType, StructType(StructField("key", LongType, true) :: Nil))), true) :: + StructField("mapFieldSimple", MapType(IntegerType, StringType), true) :: + StructField("mapFieldComplex", + MapType( + MapType(StringType, FloatType), + StructType(StructField("key", LongType, true) :: Nil)), true) :: + StructField("structFieldSimple", + StructType( + StructField("key", IntegerType, true) :: + StructField("Value", StringType, true) :: Nil), true) :: + StructField("structFieldComplex", + StructType( + StructField("key", ArrayType(StringType), true) :: + StructField("Value", + StructType( + StructField("value_(2)", ArrayType(DateType), true) :: Nil), true) :: Nil), true) :: + Nil + ) + + assert(expectedSchema == table("tableWithSchema").schema) + + checkAnswer( + sql( + """SELECT + | `string$%Field`, + | cast(binaryField as string), + | booleanField, + | byteField, + | shortField, + | int_Field, + | `longField_:,<>=+/~^`, + | floatField, + | doubleField, + | decimalField1, + | decimalField2, + | dateField, + | timestampField, + | varcharField, + | arrayFieldSimple, + | arrayFieldComplex, + | mapFieldSimple, + | mapFieldComplex, + | structFieldSimple, + | structFieldComplex FROM tableWithSchema""".stripMargin), + tableWithSchemaExpected + ) + } + + sqlTest( + "SELECT count(*) FROM tableWithSchema", + 10) + + sqlTest( + "SELECT `string$%Field` FROM tableWithSchema", + (1 to 10).map(i => Row(s"str_$i")).toSeq) + + sqlTest( + "SELECT int_Field FROM tableWithSchema WHERE int_Field < 5", + (1 to 4).map(Row(_)).toSeq) + + sqlTest( + "SELECT `longField_:,<>=+/~^` * 2 FROM tableWithSchema", + (1 to 10).map(i => Row(i * 2.toLong)).toSeq) + + sqlTest( + "SELECT structFieldSimple.key, arrayFieldSimple[1] FROM tableWithSchema a where int_Field=1", + Seq(Seq(1, 2))) + + sqlTest( + "SELECT structFieldComplex.Value.`value_(2)` FROM tableWithSchema", + (1 to 10).map(i => Row(Seq(new Date((i + 2) * 8640000)))).toSeq) test("Caching") { // Cached Query Execution diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 2c859894cf8d3..c25288e000122 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -20,12 +20,7 @@ package org.apache.spark.sql.hive import java.io.IOException import java.util.{List => JList} -import org.apache.spark.sql.execution.SparkPlan - -import scala.util.parsing.combinator.RegexParsers - import org.apache.hadoop.util.ReflectionUtils - import 
org.apache.hadoop.hive.metastore.TableType import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.hadoop.hive.metastore.api.{Table => TTable, Partition => TPartition} @@ -37,7 +32,6 @@ import org.apache.hadoop.hive.serde2.{Deserializer, SerDeException} import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.spark.Logging -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.analysis.{Catalog, OverrideCatalog} import org.apache.spark.sql.catalyst.expressions._ @@ -412,88 +406,6 @@ private[hive] case class InsertIntoHiveTable( } } -/** - * :: DeveloperApi :: - * Provides conversions between Spark SQL data types and Hive Metastore types. - */ -@DeveloperApi -object HiveMetastoreTypes extends RegexParsers { - protected lazy val primitiveType: Parser[DataType] = - "string" ^^^ StringType | - "float" ^^^ FloatType | - "int" ^^^ IntegerType | - "tinyint" ^^^ ByteType | - "smallint" ^^^ ShortType | - "double" ^^^ DoubleType | - "bigint" ^^^ LongType | - "binary" ^^^ BinaryType | - "boolean" ^^^ BooleanType | - fixedDecimalType | // Hive 0.13+ decimal with precision/scale - "decimal" ^^^ DecimalType.Unlimited | // Hive 0.12 decimal with no precision/scale - "date" ^^^ DateType | - "timestamp" ^^^ TimestampType | - "varchar\\((\\d+)\\)".r ^^^ StringType - - protected lazy val fixedDecimalType: Parser[DataType] = - ("decimal" ~> "(" ~> "\\d+".r) ~ ("," ~> "\\d+".r <~ ")") ^^ { - case precision ~ scale => - DecimalType(precision.toInt, scale.toInt) - } - - protected lazy val arrayType: Parser[DataType] = - "array" ~> "<" ~> dataType <~ ">" ^^ { - case tpe => ArrayType(tpe) - } - - protected lazy val mapType: Parser[DataType] = - "map" ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ { - case t1 ~ _ ~ t2 => MapType(t1, t2) - } - - protected lazy val structField: Parser[StructField] = - "[a-zA-Z0-9_]*".r ~ ":" ~ dataType ^^ { - case name ~ _ ~ tpe => StructField(name, tpe, nullable = true) - } - - protected lazy val structType: Parser[DataType] = - "struct" ~> "<" ~> repsep(structField,",") <~ ">" ^^ { - case fields => new StructType(fields) - } - - protected lazy val dataType: Parser[DataType] = - arrayType | - mapType | - structType | - primitiveType - - def toDataType(metastoreType: String): DataType = parseAll(dataType, metastoreType) match { - case Success(result, _) => result - case failure: NoSuccess => sys.error(s"Unsupported dataType: $metastoreType") - } - - def toMetastoreType(dt: DataType): String = dt match { - case ArrayType(elementType, _) => s"array<${toMetastoreType(elementType)}>" - case StructType(fields) => - s"struct<${fields.map(f => s"${f.name}:${toMetastoreType(f.dataType)}").mkString(",")}>" - case MapType(keyType, valueType, _) => - s"map<${toMetastoreType(keyType)},${toMetastoreType(valueType)}>" - case StringType => "string" - case FloatType => "float" - case IntegerType => "int" - case ByteType => "tinyint" - case ShortType => "smallint" - case DoubleType => "double" - case LongType => "bigint" - case BinaryType => "binary" - case BooleanType => "boolean" - case DateType => "date" - case d: DecimalType => HiveShim.decimalMetastoreString(d) - case TimestampType => "timestamp" - case NullType => "void" - case udt: UserDefinedType[_] => toMetastoreType(udt.sqlType) - } -} - private[hive] case class MetastoreRelation (databaseName: String, tableName: String, alias: Option[String]) (val table: TTable, val partitions: Seq[TPartition]) @@ -551,7 +463,7 @@ private[hive] case 
class MetastoreRelation implicit class SchemaAttribute(f: FieldSchema) { def toAttribute = AttributeReference( f.getName, - HiveMetastoreTypes.toDataType(f.getType), + sqlContext.ddlParser.parseType(f.getType), // Since data can be dumped in randomly with no validation, everything is nullable. nullable = true )(qualifiers = Seq(alias.getOrElse(tableName))) @@ -571,3 +483,27 @@ private[hive] case class MetastoreRelation /** An attribute map for determining the ordinal for non-partition columns. */ val columnOrdinals = AttributeMap(attributes.zipWithIndex) } + +object HiveMetastoreTypes { + def toMetastoreType(dt: DataType): String = dt match { + case ArrayType(elementType, _) => s"array<${toMetastoreType(elementType)}>" + case StructType(fields) => + s"struct<${fields.map(f => s"${f.name}:${toMetastoreType(f.dataType)}").mkString(",")}>" + case MapType(keyType, valueType, _) => + s"map<${toMetastoreType(keyType)},${toMetastoreType(valueType)}>" + case StringType => "string" + case FloatType => "float" + case IntegerType => "int" + case ByteType => "tinyint" + case ShortType => "smallint" + case DoubleType => "double" + case LongType => "bigint" + case BinaryType => "binary" + case BooleanType => "boolean" + case DateType => "date" + case d: DecimalType => HiveShim.decimalMetastoreString(d) + case TimestampType => "timestamp" + case NullType => "void" + case udt: UserDefinedType[_] => toMetastoreType(udt.sqlType) + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 86535f8dd4f58..041a36f1295ef 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive import org.scalatest.FunSuite import org.apache.spark.sql.catalyst.types.StructType +import org.apache.spark.sql.sources.DDLParser import org.apache.spark.sql.test.ExamplePointUDT class HiveMetastoreCatalogSuite extends FunSuite { @@ -27,7 +28,9 @@ class HiveMetastoreCatalogSuite extends FunSuite { test("struct field should accept underscore in sub-column name") { val metastr = "struct" - val datatype = HiveMetastoreTypes.toDataType(metastr) + val ddlParser = new DDLParser + + val datatype = ddlParser.parseType(metastr) assert(datatype.isInstanceOf[StructType]) } From b3e86dc62476abb03b330f86a788aa19a6565317 Mon Sep 17 00:00:00 2001 From: scwf Date: Sat, 10 Jan 2015 14:08:04 -0800 Subject: [PATCH 107/116] [SPARK-4861][SQL] Refactory command in spark sql Follow up for #3712. This PR finally remove ```CommandStrategy``` and make all commands follow ```RunnableCommand``` so they can go with ```case r: RunnableCommand => ExecutedCommand(r) :: Nil```. One exception is the ```DescribeCommand``` of hive, which is a special case and need to distinguish hive table and temporary table, so still keep ```HiveCommandStrategy``` here. 
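For illustration, here is a minimal sketch of what a command looks like once it follows this pattern. It assumes, as the diffs below suggest, that `RunnableCommand` lives in `org.apache.spark.sql.execution` and only requires an `output` schema plus a `run(sqlContext): Seq[Row]` method; the command itself (`EchoCommand`) and its column are hypothetical, not part of this patch.

```scala
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.types.StringType
import org.apache.spark.sql.execution.RunnableCommand

// Hypothetical command that returns a one-row, one-column result.
// After this refactoring it needs no dedicated strategy case: the generic
// `case r: RunnableCommand => ExecutedCommand(r) :: Nil` rule plans it.
case class EchoCommand(message: String) extends RunnableCommand {

  // Declare the result schema, mirroring how SetCommand/ExplainCommand do it.
  override def output: Seq[Attribute] =
    Seq(AttributeReference("message", StringType, nullable = false)())

  // Side effects and result materialization happen at execution time.
  override def run(sqlContext: SQLContext): Seq[Row] = Seq(Row(message))
}
```

The benefit of the refactoring is visible here: planning reduces to one pattern match on the trait instead of one `CommandStrategy` case per command.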
Author: scwf Closes #3948 from scwf/followup-SPARK-4861 and squashes the following commits: 6b48e64 [scwf] minor style fix 2c62e9d [scwf] fix for hive module 5a7a819 [scwf] Refactory command in spark sql --- ...ser.scala => AbstractSparkSQLParser.scala} | 69 ------------- .../sql/catalyst/plans/logical/commands.scala | 48 +-------- .../org/apache/spark/sql/SQLContext.scala | 3 +- .../org/apache/spark/sql/SparkSQLParser.scala | 97 +++++++++++++++++++ .../spark/sql/execution/SparkStrategies.scala | 20 +--- .../apache/spark/sql/execution/commands.scala | 2 +- .../spark/sql/hive/thriftserver/Shim12.scala | 4 +- .../spark/sql/hive/thriftserver/Shim13.scala | 4 +- .../apache/spark/sql/hive/HiveContext.scala | 4 +- .../org/apache/spark/sql/hive/HiveQl.scala | 31 +++++- .../spark/sql/hive/HiveStrategies.scala | 5 +- .../org/apache/spark/sql/hive/TestHive.scala | 3 +- .../hive/execution/HiveComparisonTest.scala | 2 + 13 files changed, 141 insertions(+), 151 deletions(-) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/{SparkSQLParser.scala => AbstractSparkSQLParser.scala} (62%) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SparkSQLParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala similarity index 62% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SparkSQLParser.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala index f1a1ca6616a21..93d74adbcc957 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SparkSQLParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/AbstractSparkSQLParser.scala @@ -105,72 +105,3 @@ class SqlLexical(val keywords: Seq[String]) extends StdLexical { } } } - -/** - * The top level Spark SQL parser. This parser recognizes syntaxes that are available for all SQL - * dialects supported by Spark SQL, and delegates all the other syntaxes to the `fallback` parser. - * - * @param fallback A function that parses an input string to a logical plan - */ -private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends AbstractSparkSQLParser { - - // A parser for the key-value part of the "SET [key = [value ]]" syntax - private object SetCommandParser extends RegexParsers { - private val key: Parser[String] = "(?m)[^=]+".r - - private val value: Parser[String] = "(?m).*$".r - - private val pair: Parser[LogicalPlan] = - (key ~ ("=".r ~> value).?).? 
^^ { - case None => SetCommand(None) - case Some(k ~ v) => SetCommand(Some(k.trim -> v.map(_.trim))) - } - - def apply(input: String): LogicalPlan = parseAll(pair, input) match { - case Success(plan, _) => plan - case x => sys.error(x.toString) - } - } - - protected val AS = Keyword("AS") - protected val CACHE = Keyword("CACHE") - protected val LAZY = Keyword("LAZY") - protected val SET = Keyword("SET") - protected val TABLE = Keyword("TABLE") - protected val UNCACHE = Keyword("UNCACHE") - - protected implicit def asParser(k: Keyword): Parser[String] = - lexical.allCaseVersions(k.str).map(x => x : Parser[String]).reduce(_ | _) - - private val reservedWords: Seq[String] = - this - .getClass - .getMethods - .filter(_.getReturnType == classOf[Keyword]) - .map(_.invoke(this).asInstanceOf[Keyword].str) - - override val lexical = new SqlLexical(reservedWords) - - override protected lazy val start: Parser[LogicalPlan] = cache | uncache | set | others - - private lazy val cache: Parser[LogicalPlan] = - CACHE ~> LAZY.? ~ (TABLE ~> ident) ~ (AS ~> restInput).? ^^ { - case isLazy ~ tableName ~ plan => - CacheTableCommand(tableName, plan.map(fallback), isLazy.isDefined) - } - - private lazy val uncache: Parser[LogicalPlan] = - UNCACHE ~ TABLE ~> ident ^^ { - case tableName => UncacheTableCommand(tableName) - } - - private lazy val set: Parser[LogicalPlan] = - SET ~> restInput ^^ { - case input => SetCommandParser(input) - } - - private lazy val others: Parser[LogicalPlan] = - wholeInput ^^ { - case input => fallback(input) - } -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala index 5a1863953eae9..45905f8ef98c5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/commands.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} -import org.apache.spark.sql.catalyst.types.StringType +import org.apache.spark.sql.catalyst.expressions.Attribute /** * A logical node that represents a non-query command to be executed by the system. For example, @@ -28,48 +27,3 @@ abstract class Command extends LeafNode { self: Product => def output: Seq[Attribute] = Seq.empty } - -/** - * - * Commands of the form "SET [key [= value] ]". - */ -case class SetCommand(kv: Option[(String, Option[String])]) extends Command { - override def output = Seq( - AttributeReference("", StringType, nullable = false)()) -} - -/** - * Returned by a parser when the users only wants to see what query plan would be executed, without - * actually performing the execution. - */ -case class ExplainCommand(plan: LogicalPlan, extended: Boolean = false) extends Command { - override def output = - Seq(AttributeReference("plan", StringType, nullable = false)()) -} - -/** - * Returned for the "CACHE TABLE tableName [AS SELECT ...]" command. - */ -case class CacheTableCommand(tableName: String, plan: Option[LogicalPlan], isLazy: Boolean) - extends Command - -/** - * Returned for the "UNCACHE TABLE tableName" command. - */ -case class UncacheTableCommand(tableName: String) extends Command - -/** - * Returned for the "DESCRIBE [EXTENDED] [dbName.]tableName" command. - * @param table The table to be described. - * @param isExtended True if "DESCRIBE EXTENDED" is used. Otherwise, false. 
- * It is effective only when the table is a Hive table. - */ -case class DescribeCommand( - table: LogicalPlan, - isExtended: Boolean) extends Command { - override def output = Seq( - // Column names are based on Hive. - AttributeReference("col_name", StringType, nullable = false)(), - AttributeReference("data_type", StringType, nullable = false)(), - AttributeReference("comment", StringType, nullable = false)()) -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 9962937277dad..6c575dd727b46 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -76,7 +76,7 @@ class SQLContext(@transient val sparkContext: SparkContext) @transient protected[sql] val sqlParser = { val fallback = new catalyst.SqlParser - new catalyst.SparkSQLParser(fallback(_)) + new SparkSQLParser(fallback(_)) } protected[sql] def parseSql(sql: String): LogicalPlan = { @@ -329,7 +329,6 @@ class SQLContext(@transient val sparkContext: SparkContext) def strategies: Seq[Strategy] = extraStrategies ++ ( - CommandStrategy :: DataSourceStrategy :: TakeOrdered :: HashAggregation :: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala new file mode 100644 index 0000000000000..65358b7d4ea8e --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.{SqlLexical, AbstractSparkSQLParser} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.{UncacheTableCommand, CacheTableCommand, SetCommand} + +import scala.util.parsing.combinator.RegexParsers + +/** + * The top level Spark SQL parser. This parser recognizes syntaxes that are available for all SQL + * dialects supported by Spark SQL, and delegates all the other syntaxes to the `fallback` parser. 
+ * + * @param fallback A function that parses an input string to a logical plan + */ +private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends AbstractSparkSQLParser { + + // A parser for the key-value part of the "SET [key = [value ]]" syntax + private object SetCommandParser extends RegexParsers { + private val key: Parser[String] = "(?m)[^=]+".r + + private val value: Parser[String] = "(?m).*$".r + + private val output: Seq[Attribute] = Seq(AttributeReference("", StringType, nullable = false)()) + + private val pair: Parser[LogicalPlan] = + (key ~ ("=".r ~> value).?).? ^^ { + case None => SetCommand(None, output) + case Some(k ~ v) => SetCommand(Some(k.trim -> v.map(_.trim)), output) + } + + def apply(input: String): LogicalPlan = parseAll(pair, input) match { + case Success(plan, _) => plan + case x => sys.error(x.toString) + } + } + + protected val AS = Keyword("AS") + protected val CACHE = Keyword("CACHE") + protected val LAZY = Keyword("LAZY") + protected val SET = Keyword("SET") + protected val TABLE = Keyword("TABLE") + protected val UNCACHE = Keyword("UNCACHE") + + protected implicit def asParser(k: Keyword): Parser[String] = + lexical.allCaseVersions(k.str).map(x => x : Parser[String]).reduce(_ | _) + + private val reservedWords: Seq[String] = + this + .getClass + .getMethods + .filter(_.getReturnType == classOf[Keyword]) + .map(_.invoke(this).asInstanceOf[Keyword].str) + + override val lexical = new SqlLexical(reservedWords) + + override protected lazy val start: Parser[LogicalPlan] = cache | uncache | set | others + + private lazy val cache: Parser[LogicalPlan] = + CACHE ~> LAZY.? ~ (TABLE ~> ident) ~ (AS ~> restInput).? ^^ { + case isLazy ~ tableName ~ plan => + CacheTableCommand(tableName, plan.map(fallback), isLazy.isDefined) + } + + private lazy val uncache: Parser[LogicalPlan] = + UNCACHE ~ TABLE ~> ident ^^ { + case tableName => UncacheTableCommand(tableName) + } + + private lazy val set: Parser[LogicalPlan] = + SET ~> restInput ^^ { + case input => SetCommandParser(input) + } + + private lazy val others: Parser[LogicalPlan] = + wholeInput ^^ { + case input => fallback(input) + } + +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index ce878c137e627..99b6611d3bbcf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -259,6 +259,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { def numPartitions = self.numPartitions def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case r: RunnableCommand => ExecutedCommand(r) :: Nil + case logical.Distinct(child) => execution.Distinct(partial = false, execution.Distinct(partial = true, planLater(child))) :: Nil @@ -308,22 +310,4 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case _ => Nil } } - - case object CommandStrategy extends Strategy { - def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case r: RunnableCommand => ExecutedCommand(r) :: Nil - case logical.SetCommand(kv) => - Seq(ExecutedCommand(execution.SetCommand(kv, plan.output))) - case logical.ExplainCommand(logicalPlan, extended) => - Seq(ExecutedCommand( - execution.ExplainCommand(logicalPlan, plan.output, extended))) - case logical.CacheTableCommand(tableName, optPlan, isLazy) => - Seq(ExecutedCommand( - 
execution.CacheTableCommand(tableName, optPlan, isLazy))) - case logical.UncacheTableCommand(tableName) => - Seq(ExecutedCommand( - execution.UncacheTableCommand(tableName))) - case _ => Nil - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index b8fa4b019953e..0d765c4c92f85 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -113,7 +113,7 @@ case class SetCommand( @DeveloperApi case class ExplainCommand( logicalPlan: LogicalPlan, - override val output: Seq[Attribute], extended: Boolean) extends RunnableCommand { + override val output: Seq[Attribute], extended: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sqlContext: SQLContext) = try { diff --git a/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala b/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala index 5550183621fb6..80733ea1db93b 100644 --- a/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala +++ b/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala @@ -33,8 +33,8 @@ import org.apache.hive.service.cli.operation.ExecuteStatementOperation import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.plans.logical.SetCommand import org.apache.spark.sql.catalyst.types._ +import org.apache.spark.sql.execution.SetCommand import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} import org.apache.spark.sql.{SQLConf, SchemaRDD, Row => SparkRow} @@ -190,7 +190,7 @@ private[hive] class SparkExecuteStatementOperation( result = hiveContext.sql(statement) logDebug(result.queryExecution.toString()) result.queryExecution.logical match { - case SetCommand(Some((SQLConf.THRIFTSERVER_POOL, Some(value)))) => + case SetCommand(Some((SQLConf.THRIFTSERVER_POOL, Some(value))), _) => sessionToActivePool(parentSession.getSessionHandle) = value logInfo(s"Setting spark.scheduler.pool=$value for future statements in this session.") case _ => diff --git a/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala b/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala index 798a690a20427..19d85140071ea 100644 --- a/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala +++ b/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala @@ -31,7 +31,7 @@ import org.apache.hive.service.cli.operation.ExecuteStatementOperation import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.plans.logical.SetCommand +import org.apache.spark.sql.execution.SetCommand import org.apache.spark.sql.catalyst.types._ import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} @@ -161,7 +161,7 @@ private[hive] class SparkExecuteStatementOperation( result = hiveContext.sql(statement) logDebug(result.queryExecution.toString()) result.queryExecution.logical match { - case 
SetCommand(Some((SQLConf.THRIFTSERVER_POOL, Some(value)))) => + case SetCommand(Some((SQLConf.THRIFTSERVER_POOL, Some(value))), _) => sessionToActivePool(parentSession.getSessionHandle) = value logInfo(s"Setting spark.scheduler.pool=$value for future statements in this session.") case _ => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 1648fa826b900..02eac43b2103f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -38,8 +38,7 @@ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateAnalysisOperators, OverrideCatalog, OverrideFunctionRegistry} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.types.DecimalType -import org.apache.spark.sql.catalyst.types.decimal.Decimal -import org.apache.spark.sql.execution.{SparkPlan, ExecutedCommand, ExtractPythonUdfs, QueryExecutionException} +import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, SetCommand, QueryExecutionException} import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DescribeHiveTableCommand} import org.apache.spark.sql.sources.DataSourceStrategy @@ -340,7 +339,6 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { override def strategies: Seq[Strategy] = extraStrategies ++ Seq( DataSourceStrategy, - CommandStrategy, HiveCommandStrategy(self), TakeOrdered, ParquetOperations, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index c2ab3579c1f95..28de03c38997b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -24,8 +24,8 @@ import org.apache.hadoop.hive.ql.lib.Node import org.apache.hadoop.hive.ql.metadata.Table import org.apache.hadoop.hive.ql.parse._ import org.apache.hadoop.hive.ql.plan.PlanUtils +import org.apache.spark.sql.SparkSQLParser -import org.apache.spark.sql.catalyst.SparkSQLParser import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.types._ import org.apache.spark.sql.catalyst.types.decimal.Decimal +import org.apache.spark.sql.execution.ExplainCommand import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, AnalyzeTable} /* Implicit conversions */ @@ -45,6 +46,22 @@ import scala.collection.JavaConversions._ */ private[hive] case object NativePlaceholder extends Command +/** + * Returned for the "DESCRIBE [EXTENDED] [dbName.]tableName" command. + * @param table The table to be described. + * @param isExtended True if "DESCRIBE EXTENDED" is used. Otherwise, false. + * It is effective only when the table is a Hive table. + */ +case class DescribeCommand( + table: LogicalPlan, + isExtended: Boolean) extends Command { + override def output = Seq( + // Column names are based on Hive. 
+ AttributeReference("col_name", StringType, nullable = false)(), + AttributeReference("data_type", StringType, nullable = false)(), + AttributeReference("comment", StringType, nullable = false)()) +} + /** Provides a mapping from HiveQL statements to catalyst logical plans and expression trees. */ private[hive] object HiveQl { protected val nativeCommands = Seq( @@ -457,17 +474,23 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C // Just fake explain for any of the native commands. case Token("TOK_EXPLAIN", explainArgs) if noExplainCommands.contains(explainArgs.head.getText) => - ExplainCommand(NoRelation) + ExplainCommand(NoRelation, Seq(AttributeReference("plan", StringType, nullable = false)())) case Token("TOK_EXPLAIN", explainArgs) if "TOK_CREATETABLE" == explainArgs.head.getText => val Some(crtTbl) :: _ :: extended :: Nil = getClauses(Seq("TOK_CREATETABLE", "FORMATTED", "EXTENDED"), explainArgs) - ExplainCommand(nodeToPlan(crtTbl), extended != None) + ExplainCommand( + nodeToPlan(crtTbl), + Seq(AttributeReference("plan", StringType,nullable = false)()), + extended != None) case Token("TOK_EXPLAIN", explainArgs) => // Ignore FORMATTED if present. val Some(query) :: _ :: extended :: Nil = getClauses(Seq("TOK_QUERY", "FORMATTED", "EXTENDED"), explainArgs) - ExplainCommand(nodeToPlan(query), extended != None) + ExplainCommand( + nodeToPlan(query), + Seq(AttributeReference("plan", StringType, nullable = false)()), + extended != None) case Token("TOK_DESCTABLE", describeArgs) => // Reference: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index d3f6381b69a4d..c439b9ebfe104 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.planning._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.types.StringType +import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand} import org.apache.spark.sql.execution._ import org.apache.spark.sql.hive import org.apache.spark.sql.hive.execution._ @@ -209,14 +210,14 @@ private[hive] trait HiveStrategies { case class HiveCommandStrategy(context: HiveContext) extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case describe: logical.DescribeCommand => + case describe: DescribeCommand => val resolvedTable = context.executePlan(describe.table).analyzed resolvedTable match { case t: MetastoreRelation => ExecutedCommand( DescribeHiveTableCommand(t, describe.output, describe.isExtended)) :: Nil case o: LogicalPlan => - ExecutedCommand(DescribeCommand(planLater(o), describe.output)) :: Nil + ExecutedCommand(RunnableDescribeCommand(planLater(o), describe.output)) :: Nil } case _ => Nil diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala index 8f2311cf83eb8..1358a0eccb353 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala @@ -34,8 +34,9 @@ import org.apache.hadoop.hive.serde2.avro.AvroSerDe import org.apache.spark.{SparkConf, SparkContext} import 
org.apache.spark.util.Utils import org.apache.spark.sql.catalyst.analysis._ -import org.apache.spark.sql.catalyst.plans.logical.{CacheTableCommand, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.execution.CacheTableCommand import org.apache.spark.sql.hive._ import org.apache.spark.sql.SQLConf import org.apache.spark.sql.hive.execution.HiveNativeCommand diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 4104df8f8e022..f8a957d55d57e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -22,6 +22,8 @@ import java.io._ import org.scalatest.{BeforeAndAfterAll, FunSuite, GivenWhenThen} import org.apache.spark.Logging +import org.apache.spark.sql.execution.{SetCommand, ExplainCommand} +import org.apache.spark.sql.hive.DescribeCommand import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util._ From 77106df69147aba5eb1784adb84e2b574927c6de Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sat, 10 Jan 2015 14:16:37 -0800 Subject: [PATCH 108/116] SPARK-4963 [SQL] Add copy to SQL's Sample operator https://issues.apache.org/jira/browse/SPARK-4963 SchemaRDD.sample() return wrong results due to GapSamplingIterator operating on mutable row. HiveTableScan make RDD with SpecificMutableRow and SchemaRDD.sample() will return GapSamplingIterator for iterating. override def next(): T = { val r = data.next() advance r } GapSamplingIterator.next() return the current underlying element and assigned it to r. However if the underlying iterator is mutable row just like what HiveTableScan returned, underlying iterator and r will point to the same object. After advance operation, we drop some underlying elments and it also changed r which is not expected. Then we return the wrong value different from initial r. To fix this issue, the most direct way is to make HiveTableScan return mutable row with copy just like the initial commit that I have made. This solution will make HiveTableScan can not get the full advantage of reusable MutableRow, but it can make sample operation return correct result. Further more, we need to investigate GapSamplingIterator.next() and make it can implement copy operation inside it. To achieve this, we should define every elements that RDD can store implement the function like cloneable and it will make huge change. 
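As a self-contained illustration of the aliasing described above (plain Scala, not Spark code; all names are made up), an iterator that reuses one mutable record behaves exactly like the HiveTableScan plus GapSampling combination: keeping a reference and then advancing rewrites the element that was kept, while copying each record before it reaches the sampler — the `.map(_.copy())` approach this patch takes — breaks the aliasing.

```scala
// A source that reuses a single mutable record, analogous to HiveTableScan
// handing out the same SpecificMutableRow on every call to next().
class ReusingIterator(n: Int) extends Iterator[Array[Int]] {
  private val buffer = new Array[Int](1)
  private var i = 0
  def hasNext: Boolean = i < n
  def next(): Array[Int] = { buffer(0) = i; i += 1; buffer }
}

// A toy sampler in the GapSampling style: keep the current element,
// then advance past the next one.
def sampleEveryOther(data: Iterator[Array[Int]]): Seq[Array[Int]] = {
  val kept = scala.collection.mutable.ArrayBuffer[Array[Int]]()
  while (data.hasNext) {
    val r = data.next()             // keep a reference to the current element
    if (data.hasNext) data.next()   // advancing mutates that very same object
    kept += r
  }
  kept
}

// Every kept element aliases the same buffer, so the results collapse:
sampleEveryOther(new ReusingIterator(6)).map(_(0))                  // Seq(5, 5, 5)
// Copying each record before sampling restores the expected answer:
sampleEveryOther(new ReusingIterator(6).map(_.clone())).map(_(0))   // Seq(0, 2, 4)
```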
Author: Yanbo Liang Closes #3827 from yanbohappy/spark-4963 and squashes the following commits: 0912ca0 [Yanbo Liang] code format keep 65c4e7c [Yanbo Liang] import file and clear annotation 55c7c56 [Yanbo Liang] better output of test case cea7e2e [Yanbo Liang] SchemaRDD add copy operation before Sample operator e840829 [Yanbo Liang] HiveTableScan return mutable row with copy --- .../apache/spark/sql/execution/basicOperators.scala | 2 +- .../spark/sql/hive/execution/SQLQuerySuite.scala | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala index e53723c176569..16ca4be5587c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala @@ -70,7 +70,7 @@ case class Sample(fraction: Double, withReplacement: Boolean, seed: Long, child: override def output = child.output // TODO: How to pick seed? - override def execute() = child.execute().sample(withReplacement, fraction, seed) + override def execute() = child.execute().map(_.copy()).sample(withReplacement, fraction, seed) } /** diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 5d0fb7237011f..c1c3683f84ab2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.QueryTest import org.apache.spark.sql.Row import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.util.Utils case class Nested1(f1: Nested2) case class Nested2(f2: Nested3) @@ -202,4 +203,15 @@ class SQLQuerySuite extends QueryTest { checkAnswer(sql("SELECT sum( distinct key) FROM src group by key order by key"), sql("SELECT distinct key FROM src order by key").collect().toSeq) } + + test("SPARK-4963 SchemaRDD sample on mutable row return wrong result") { + sql("SELECT * FROM src WHERE key % 2 = 0") + .sample(withReplacement = false, fraction = 0.3) + .registerTempTable("sampled") + (1 to 10).foreach { i => + checkAnswer( + sql("SELECT * FROM sampled WHERE key % 2 = 1"), + Seq.empty[Row]) + } + } } From 3684fd21e1ffdc0adaad8ff6b31394b637e866ce Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Sat, 10 Jan 2015 14:25:45 -0800 Subject: [PATCH 109/116] [SPARK-5187][SQL] Fix caching of tables with HiveUDFs in the WHERE clause Author: Michael Armbrust Closes #3987 from marmbrus/hiveUdfCaching and squashes the following commits: 8bca2fa [Michael Armbrust] [SPARK-5187][SQL] Fix caching of tables with HiveUDFs in the WHERE clause --- .../scala/org/apache/spark/sql/hive/CachedTableSuite.scala | 6 ++++++ .../src/main/scala/org/apache/spark/sql/hive/Shim12.scala | 2 +- .../src/main/scala/org/apache/spark/sql/hive/Shim13.scala | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index 2060e1f1a7a4b..f95a6b43af357 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -158,4 +158,10 @@ class CachedTableSuite extends QueryTest { 
uncacheTable("src") assert(!isMaterialized(rddId), "Uncached in-memory table should have been unpersisted") } + + test("CACHE TABLE with Hive UDF") { + sql("CACHE TABLE udfTest AS SELECT * FROM src WHERE floor(key) = 1") + assertCached(table("udfTest")) + uncacheTable("udfTest") + } } diff --git a/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala b/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala index 2d01a85067518..25fdf5c5f3da6 100644 --- a/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala +++ b/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala @@ -44,7 +44,7 @@ import scala.language.implicitConversions import org.apache.spark.sql.catalyst.types.DecimalType -class HiveFunctionWrapper(var functionClassName: String) extends java.io.Serializable { +case class HiveFunctionWrapper(functionClassName: String) extends java.io.Serializable { // for Serialization def this() = this(null) diff --git a/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala b/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala index b78c75798e988..e47002cb0b8c8 100644 --- a/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala +++ b/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala @@ -53,7 +53,7 @@ import scala.language.implicitConversions * * @param functionClassName UDF class name */ -class HiveFunctionWrapper(var functionClassName: String) extends java.io.Externalizable { +case class HiveFunctionWrapper(var functionClassName: String) extends java.io.Externalizable { // for Serialization def this() = this(null) From 0ca51cc31d5dd16aa458956e582eec58bbc31711 Mon Sep 17 00:00:00 2001 From: YanTangZhai Date: Sat, 10 Jan 2015 15:05:23 -0800 Subject: [PATCH 110/116] [SPARK-4692] [SQL] Support ! boolean logic operator like NOT Support ! boolean logic operator like NOT in sql as follows select * from for_test where !(col1 > col2) Author: YanTangZhai Author: Michael Armbrust Closes #3555 from YanTangZhai/SPARK-4692 and squashes the following commits: 1a9f605 [YanTangZhai] Update HiveQuerySuite.scala 7c03c68 [YanTangZhai] Merge pull request #23 from apache/master 992046e [YanTangZhai] Update HiveQuerySuite.scala ea618f4 [YanTangZhai] Update HiveQuerySuite.scala 192411d [YanTangZhai] Merge pull request #17 from YanTangZhai/master e4c2c0a [YanTangZhai] Merge pull request #15 from apache/master 1e1ebb4 [YanTangZhai] Update HiveQuerySuite.scala efc4210 [YanTangZhai] Update HiveQuerySuite.scala bd2c444 [YanTangZhai] Update HiveQuerySuite.scala 1893956 [YanTangZhai] Merge pull request #14 from marmbrus/pr/3555 59e4de9 [Michael Armbrust] make hive test 718afeb [YanTangZhai] Merge pull request #12 from apache/master 950b21e [YanTangZhai] Update HiveQuerySuite.scala 74175b4 [YanTangZhai] Update HiveQuerySuite.scala 92242c7 [YanTangZhai] Update HiveQl.scala 6e643f8 [YanTangZhai] Merge pull request #11 from apache/master e249846 [YanTangZhai] Merge pull request #10 from apache/master d26d982 [YanTangZhai] Merge pull request #9 from apache/master 76d4027 [YanTangZhai] Merge pull request #8 from apache/master 03b62b0 [YanTangZhai] Merge pull request #7 from apache/master 8a00106 [YanTangZhai] Merge pull request #6 from apache/master cbcba66 [YanTangZhai] Merge pull request #3 from apache/master cdef539 [YanTangZhai] Merge pull request #1 from apache/master --- .../src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 1 + .../golden/! 
operator-0-81d1a187c7f4a6337baf081510a5dc5e | 1 + .../apache/spark/sql/hive/execution/HiveQuerySuite.scala | 8 ++++++++ 3 files changed, 10 insertions(+) create mode 100644 sql/hive/src/test/resources/golden/! operator-0-81d1a187c7f4a6337baf081510a5dc5e diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 28de03c38997b..34622b5f57873 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -1111,6 +1111,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case Token(AND(), left :: right:: Nil) => And(nodeToExpr(left), nodeToExpr(right)) case Token(OR(), left :: right:: Nil) => Or(nodeToExpr(left), nodeToExpr(right)) case Token(NOT(), child :: Nil) => Not(nodeToExpr(child)) + case Token("!", child :: Nil) => Not(nodeToExpr(child)) /* Case statements */ case Token("TOK_FUNCTION", Token(WHEN(), Nil) :: branches) => diff --git a/sql/hive/src/test/resources/golden/! operator-0-81d1a187c7f4a6337baf081510a5dc5e b/sql/hive/src/test/resources/golden/! operator-0-81d1a187c7f4a6337baf081510a5dc5e new file mode 100644 index 0000000000000..d00491fd7e5bb --- /dev/null +++ b/sql/hive/src/test/resources/golden/! operator-0-81d1a187c7f4a6337baf081510a5dc5e @@ -0,0 +1 @@ +1 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index fb6da33e88ef6..700a45edb11d6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -62,6 +62,14 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { sql("SHOW TABLES") } } + + createQueryTest("! 
operator", + """ + |SELECT a FROM ( + | SELECT 1 AS a FROM src LIMIT 1 UNION ALL + | SELECT 2 AS a FROM src LIMIT 1) table + |WHERE !(a>1) + """.stripMargin) createQueryTest("constant object inspector for generic udf", """SELECT named_struct( From f0d558b6e6ec0c97280d5844c98fb92c24954cbb Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 10 Jan 2015 15:35:41 -0800 Subject: [PATCH 111/116] [SPARK-5181] do not print writing WAL log when WAL is disabled https://issues.apache.org/jira/browse/SPARK-5181 Currently, even the logManager is not created, we still see the log entry s"Writing to log $record" a simple fix to make log more accurate Author: CodingCat Closes #3985 from CodingCat/SPARK-5181 and squashes the following commits: 0e27dc5 [CodingCat] do not print writing WAL log when WAL is disabled --- .../spark/streaming/scheduler/ReceivedBlockTracker.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index 2ce458cddec1a..c3d9d7b6813d3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -203,9 +203,11 @@ private[streaming] class ReceivedBlockTracker( /** Write an update to the tracker to the write ahead log */ private def writeToLog(record: ReceivedBlockTrackerLogEvent) { - logDebug(s"Writing to log $record") - logManagerOption.foreach { logManager => + if (isLogManagerEnabled) { + logDebug(s"Writing to log $record") + logManagerOption.foreach { logManager => logManager.writeToLog(ByteBuffer.wrap(Utils.serialize(record))) + } } } From 8a29dc716e3452fdf546852ddc18238018b73891 Mon Sep 17 00:00:00 2001 From: GuoQiang Li Date: Sat, 10 Jan 2015 15:38:43 -0800 Subject: [PATCH 112/116] [Minor]Resolve sbt warnings during build (MQTTStreamSuite.scala). cc andrewor14 Author: GuoQiang Li Closes #3989 from witgo/MQTTStreamSuite and squashes the following commits: a6e967e [GuoQiang Li] Resolve sbt warnings during build (MQTTStreamSuite.scala). 
--- .../scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala index 39eb8b183488f..30727dfa64437 100644 --- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.streaming.mqtt import java.net.{URI, ServerSocket} import scala.concurrent.duration._ +import scala.language.postfixOps import org.apache.activemq.broker.{TransportConnector, BrokerService} import org.eclipse.paho.client.mqttv3._ From 92d9a704ce1232bddc570bca13758b11ff9ddb1f Mon Sep 17 00:00:00 2001 From: wangfei Date: Sat, 10 Jan 2015 17:04:56 -0800 Subject: [PATCH 113/116] [SPARK-4871][SQL] Show sql statement in spark ui when run sql with spark-sql Author: wangfei Closes #3718 from scwf/sparksqlui and squashes the following commits: e0d6b5d [wangfei] format fix 383b505 [wangfei] fix conflicts 4d2038a [wangfei] using setJobDescription df79837 [wangfei] fix compile error 92ce834 [wangfei] show sql statement in spark ui when run sql use spark-sql --- core/src/main/scala/org/apache/spark/SparkContext.scala | 1 - .../spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala | 1 + .../org/apache/spark/sql/hive/thriftserver/Shim12.scala | 5 +---- .../org/apache/spark/sql/hive/thriftserver/Shim13.scala | 5 +---- 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 3bf3acd245d8f..ff5d796ee2766 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -458,7 +458,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli Option(localProperties.get).map(_.getProperty(key)).getOrElse(null) /** Set a human readable description of the current job. 
*/ - @deprecated("use setJobGroup", "0.8.1") def setJobDescription(value: String) { setLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION, value) } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala index 7a3d76c61c3a1..59f3a75768082 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/AbstractSparkSQLDriver.scala @@ -53,6 +53,7 @@ private[hive] abstract class AbstractSparkSQLDriver( override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { + context.sparkContext.setJobDescription(command) val execution = context.executePlan(context.sql(command).logicalPlan) hiveResponse = execution.stringResult() tableSchema = getResultSetSchema(execution) diff --git a/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala b/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala index 80733ea1db93b..742acba58d776 100644 --- a/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala +++ b/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala @@ -22,7 +22,6 @@ import java.util.{ArrayList => JArrayList, Map => JMap} import scala.collection.JavaConversions._ import scala.collection.mutable.{ArrayBuffer, Map => SMap} -import scala.math._ import org.apache.hadoop.hive.common.`type`.HiveDecimal import org.apache.hadoop.hive.metastore.api.FieldSchema @@ -195,9 +194,7 @@ private[hive] class SparkExecuteStatementOperation( logInfo(s"Setting spark.scheduler.pool=$value for future statements in this session.") case _ => } - - val groupId = round(random * 1000000).toString - hiveContext.sparkContext.setJobGroup(groupId, statement) + hiveContext.sparkContext.setJobDescription(statement) sessionToActivePool.get(parentSession.getSessionHandle).foreach { pool => hiveContext.sparkContext.setLocalProperty("spark.scheduler.pool", pool) } diff --git a/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala b/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala index 19d85140071ea..b82156427a88c 100644 --- a/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala +++ b/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala @@ -22,7 +22,6 @@ import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import scala.collection.JavaConversions._ import scala.collection.mutable.{ArrayBuffer, Map => SMap} -import scala.math._ import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.hadoop.security.UserGroupInformation @@ -166,9 +165,7 @@ private[hive] class SparkExecuteStatementOperation( logInfo(s"Setting spark.scheduler.pool=$value for future statements in this session.") case _ => } - - val groupId = round(random * 1000000).toString - hiveContext.sparkContext.setJobGroup(groupId, statement) + hiveContext.sparkContext.setJobDescription(statement) sessionToActivePool.get(parentSession.getSessionHandle).foreach { pool => hiveContext.sparkContext.setLocalProperty("spark.scheduler.pool", pool) } From 
d22a31f5e84e27e27a059f540d08a8a441fc17fa Mon Sep 17 00:00:00 2001 From: scwf Date: Sat, 10 Jan 2015 17:07:34 -0800 Subject: [PATCH 114/116] [SPARK-5029][SQL] Enable from follow multiple brackets Enable from follow multiple brackets: ``` select key from ((select * from testData limit 1) union all (select * from testData limit 1)) x limit 1 ``` Author: scwf Closes #3853 from scwf/from and squashes the following commits: 14f110a [scwf] enable from follow multiple brackets --- .../apache/spark/sql/catalyst/SqlParser.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index fc7b8745590d1..5d974df98b699 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -125,7 +125,7 @@ class SqlParser extends AbstractSparkSQLParser { } protected lazy val start: Parser[LogicalPlan] = - ( select * + ( (select | ("(" ~> select <~ ")")) * ( UNION ~ ALL ^^^ { (q1: LogicalPlan, q2: LogicalPlan) => Union(q1, q2) } | INTERSECT ^^^ { (q1: LogicalPlan, q2: LogicalPlan) => Intersect(q1, q2) } | EXCEPT ^^^ { (q1: LogicalPlan, q2: LogicalPlan) => Except(q1, q2)} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index add4e218a22ee..d9de5686dce48 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -272,6 +272,23 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { mapData.collect().take(1).toSeq) } + test("from follow multiple brackets") { + checkAnswer(sql( + "select key from ((select * from testData limit 1) union all (select * from testData limit 1)) x limit 1"), + 1 + ) + + checkAnswer(sql( + "select key from (select * from testData) x limit 1"), + 1 + ) + + checkAnswer(sql( + "select key from (select * from testData limit 1 union all select * from testData limit 1) x limit 1"), + 1 + ) + } + test("average") { checkAnswer( sql("SELECT AVG(a) FROM testData2"), From 33132609096d7fa45001c6a67724ec60bcaefaa9 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Sat, 10 Jan 2015 17:25:39 -0800 Subject: [PATCH 115/116] [SPARK-5032] [graphx] Remove GraphX MIMA exclude for 1.3 Since GraphX is no longer alpha as of 1.2, MimaExcludes should not exclude GraphX for 1.3 Here are the individual excludes I had to add + the associated commits: ``` // SPARK-4444 ProblemFilters.exclude[IncompatibleResultTypeProblem]( "org.apache.spark.graphx.EdgeRDD.fromEdges"), ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.EdgeRDD.filter"), ProblemFilters.exclude[IncompatibleResultTypeProblem]( "org.apache.spark.graphx.impl.EdgeRDDImpl.filter"), ``` [https://github.com/apache/spark/commit/9ac2bb18ede2e9f73c255fa33445af89aaf8a000] ``` // SPARK-3623 ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.Graph.checkpoint") ``` [https://github.com/apache/spark/commit/e895e0cbecbbec1b412ff21321e57826d2d0a982] ``` // SPARK-4620 ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.Graph.unpersist"), ``` [https://github.com/apache/spark/commit/8817fc7fe8785d7b11138ca744f22f7e70f1f0a0] CC: rxin Author: Joseph K. 
From 33132609096d7fa45001c6a67724ec60bcaefaa9 Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley"
Date: Sat, 10 Jan 2015 17:25:39 -0800
Subject: [PATCH 115/116] [SPARK-5032] [graphx] Remove GraphX MIMA exclude for 1.3

Since GraphX is no longer alpha as of 1.2, MimaExcludes should not exclude GraphX for 1.3

Here are the individual excludes I had to add + the associated commits:

```
// SPARK-4444
ProblemFilters.exclude[IncompatibleResultTypeProblem](
  "org.apache.spark.graphx.EdgeRDD.fromEdges"),
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.EdgeRDD.filter"),
ProblemFilters.exclude[IncompatibleResultTypeProblem](
  "org.apache.spark.graphx.impl.EdgeRDDImpl.filter"),
```
[https://github.com/apache/spark/commit/9ac2bb18ede2e9f73c255fa33445af89aaf8a000]

```
// SPARK-3623
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.Graph.checkpoint")
```
[https://github.com/apache/spark/commit/e895e0cbecbbec1b412ff21321e57826d2d0a982]

```
// SPARK-4620
ProblemFilters.exclude[MissingMethodProblem]("org.apache.spark.graphx.Graph.unpersist"),
```
[https://github.com/apache/spark/commit/8817fc7fe8785d7b11138ca744f22f7e70f1f0a0]

CC: rxin

Author: Joseph K. Bradley

Closes #3856 from jkbradley/graphx-mima and squashes the following commits:

1eea2f6 [Joseph K. Bradley] moved cleanup to run-tests
527ccd9 [Joseph K. Bradley] fixed jenkins script to remove ivy2 cache
802e252 [Joseph K. Bradley] Removed GraphX MIMA excludes and added line to clear spark from .m2 dir before Jenkins tests. This may not work yet...
30f8bb4 [Joseph K. Bradley] added individual mima excludes for graphx
a3fea42 [Joseph K. Bradley] removed graphx mima exclude for 1.3
---
 dev/run-tests              | 4 +++-
 project/MimaExcludes.scala | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/dev/run-tests b/dev/run-tests
index 20603fc089239..2257a566bb1bb 100755
--- a/dev/run-tests
+++ b/dev/run-tests
@@ -21,8 +21,10 @@
 FWDIR="$(cd "`dirname $0`"/..; pwd)"
 cd "$FWDIR"
 
-# Remove work directory
+# Clean up work directory and caches
 rm -rf ./work
+rm -rf ~/.ivy2/local/org.apache.spark
+rm -rf ~/.ivy2/cache/org.apache.spark
 
 source "$FWDIR/dev/run-tests-codes.sh"
 
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 31d4c317ae569..51e8bd4cf6419 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -36,7 +36,6 @@ object MimaExcludes {
     case v if v.startsWith("1.3") =>
       Seq(
         MimaBuild.excludeSparkPackage("deploy"),
-        MimaBuild.excludeSparkPackage("graphx"),
         // These are needed if checking against the sbt build, since they are part of
         // the maven-generated artifacts in the 1.2 build.
         MimaBuild.excludeSparkPackage("unused"),

From 1656aae2b4e8b026f8cfe782519f72d32ed2b291 Mon Sep 17 00:00:00 2001
From: lewuathe
Date: Sun, 11 Jan 2015 13:50:42 -0800
Subject: [PATCH 116/116] [SPARK-5073] spark.storage.memoryMapThreshold have two default value

Because major OS page sizes is about 4KB, the default value of
spark.storage.memoryMapThreshold is integrated to 2 * 4096

Author: lewuathe

Closes #3900 from Lewuathe/integrate-memoryMapThreshold and squashes the following commits:

e417acd [lewuathe] [SPARK-5073] Update docs/configuration
834aba4 [lewuathe] [SPARK-5073] Fix style
adcea33 [lewuathe] [SPARK-5073] Integrate memory map threshold to 2MB
fcce2e5 [lewuathe] [SPARK-5073] spark.storage.memoryMapThreshold have two default value
---
 core/src/main/scala/org/apache/spark/storage/DiskStore.scala | 3 ++-
 docs/configuration.md                                        | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala
index 8dadf6794039e..61ef5ff168791 100644
--- a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala
@@ -31,7 +31,8 @@ import org.apache.spark.util.Utils
 private[spark] class DiskStore(blockManager: BlockManager, diskManager: DiskBlockManager)
   extends BlockStore(blockManager) with Logging {
 
-  val minMemoryMapBytes = blockManager.conf.getLong("spark.storage.memoryMapThreshold", 2 * 4096L)
+  val minMemoryMapBytes = blockManager.conf.getLong(
+    "spark.storage.memoryMapThreshold", 2 * 1024L * 1024L)
 
   override def getSize(blockId: BlockId): Long = {
     diskManager.getFile(blockId.name).length
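For context on what the constant above controls: DiskStore compares each block's size with spark.storage.memoryMapThreshold and only memory-maps blocks at or above it; smaller blocks are read into a plain buffer, because every mapping carries fixed OS overhead. A simplified, hypothetical sketch of that decision (readBlock and its arguments are illustrative, not Spark's actual DiskStore API):

```scala
import java.io.{File, IOException, RandomAccessFile}
import java.nio.ByteBuffer
import java.nio.channels.FileChannel

import org.apache.spark.SparkConf

object MemoryMapSketch {
  // Mirrors the patched default: 2 MB instead of the old 2 * 4096 bytes.
  def readBlock(conf: SparkConf, file: File, offset: Long, length: Long): ByteBuffer = {
    val minMemoryMapBytes =
      conf.getLong("spark.storage.memoryMapThreshold", 2L * 1024L * 1024L)
    val channel = new RandomAccessFile(file, "r").getChannel
    try {
      if (length < minMemoryMapBytes) {
        // Small block: a direct read avoids the per-mapping overhead.
        val buf = ByteBuffer.allocate(length.toInt)
        channel.position(offset)
        while (buf.remaining() > 0) {
          if (channel.read(buf) == -1) {
            throw new IOException("Reached EOF before reading the full block")
          }
        }
        buf.flip()
        buf
      } else {
        // Large block: mapping it is cheaper than copying it onto the heap.
        channel.map(FileChannel.MapMode.READ_ONLY, offset, length)
      }
    } finally {
      channel.close()
    }
  }
}
```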
diff --git a/docs/configuration.md b/docs/configuration.md
index 2add48569bece..f292bfbb7dcd6 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -678,7 +678,7 @@ Apart from these, the following properties are also available, and may be useful

[Fragment of a YARN configuration table (Property Name / Default / Meaning); lines marked "+" and "-" are the new and old descriptions:]

    spark.yarn.am.memory  (default: 512m)
      + Amount of memory to use for the YARN Application Master in client mode, in the same format as JVM memory strings (e.g. 512m, 2g).
      + In cluster mode, use spark.driver.memory instead.

    spark.yarn.am.waitTime  (default: 100000)

    spark.yarn.driver.memoryOverhead  (default: driverMemory * 0.07, with minimum of 384)
      - The amount of off heap memory (in megabytes) to be allocated per driver. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%).
      + The amount of off heap memory (in megabytes) to be allocated per driver in cluster mode. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%).

    spark.yarn.am.memoryOverhead  (default: AM memory * 0.07, with minimum of 384)
      + Same as spark.yarn.driver.memoryOverhead, but for the Application Master in client mode.

    spark.yarn.am.extraJavaOptions  (default: (none))
      - A string of extra JVM options to pass to the Yarn ApplicationMaster in client mode.
      + A string of extra JVM options to pass to the YARN Application Master in client mode. In cluster mode, use spark.driver.extraJavaOptions instead.

 <tr>
   <td><code>spark.storage.memoryMapThreshold</code></td>
-  <td>8192</td>
+  <td>2097152</td>
   <td>
     Size of a block, in bytes, above which Spark memory maps when reading a block from disk.
     This prevents Spark from memory mapping very small blocks. In general, memory