[SPARK-1455] [SPARK-3534] [Build] When possible, run SQL tests only.

If the only files changed are related to SQL, then only run the SQL tests. This patch includes some cosmetic/maintainability refactoring. I would be more than happy to undo some of these changes if they are inappropriate. We can accept this patch mostly as-is and address the immediate need documented in [SPARK-3534](https://issues.apache.org/jira/browse/SPARK-3534), or we can keep it open until a satisfactory solution along the lines [discussed here](https://issues.apache.org/jira/browse/SPARK-1455?focusedCommentId=14136424&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-14136424) is reached. Note: I had to hack this patch up to test it locally, so what I'm submitting here and what I tested are technically different. Author: Nicholas Chammas <nicholas.chammas@gmail.com> Closes apache#2420 from nchammas/selective-testing and squashes the following commits: db3fa2d [Nicholas Chammas] diff against master! f9e23f6 [Nicholas Chammas] when possible, run SQL tests only
freeman-lab · Sep 17, 2014 · 5044e49 · 5044e49
1 parent cbf983b
commit 5044e49
Showing 1 changed file with 106 additions and 50 deletions.
diff --git a/dev/run-tests b/dev/run-tests
@@ -21,44 +21,73 @@
 FWDIR="$(cd "`dirname $0`"/..; pwd)"
 cd "$FWDIR"
 
-if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then
-  if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then
-    export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=1.0.4"
-  elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then
-    export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=2.0.0-mr1-cdh4.1.1"
-  elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then
-    export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Dhadoop.version=2.2.0"
-  elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.3" ]; then
-    export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0"
+# Remove work directory
+rm -rf ./work
+
+# Build against the right version of Hadoop.
+{
+  if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then
+    if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then
+      export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=1.0.4"
+    elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then
+      export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=2.0.0-mr1-cdh4.1.1"
+    elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then
+      export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Dhadoop.version=2.2.0"
+    elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.3" ]; then
+      export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0"
+    fi
   fi
-fi
 
-if [ -z "$SBT_MAVEN_PROFILES_ARGS" ]; then
-  export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0"
-fi
+  if [ -z "$SBT_MAVEN_PROFILES_ARGS" ]; then
+    export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0"
+  fi
+}
 
 export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl"
 
-echo "SBT_MAVEN_PROFILES_ARGS=\"$SBT_MAVEN_PROFILES_ARGS\""
-
-# Remove work directory
-rm -rf ./work
-
-if test -x "$JAVA_HOME/bin/java"; then
-    declare java_cmd="$JAVA_HOME/bin/java"
-else 
-    declare java_cmd=java
-fi
-JAVA_VERSION=$($java_cmd -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q')
-[ "$JAVA_VERSION" -ge 18 ] && echo "" || echo "[Warn] Java 8 tests will not run because JDK version is < 1.8."
+# Determine Java path and version.
+{
+  if test -x "$JAVA_HOME/bin/java"; then
+      declare java_cmd="$JAVA_HOME/bin/java"
+  else
+      declare java_cmd=java
+  fi
+
+  # We can't use sed -r -e due to OS X / BSD compatibility; hence, all the parentheses.
+  JAVA_VERSION=$(
+    "$java_cmd" -version 2>&1 \
+    | grep -e "^java version" --max-count=1 \
+    | sed "s/java version \"\(.*\)\.\(.*\)\.\(.*\)\"/\1\2/"
+  )
+
+  if [ "$JAVA_VERSION" -lt 18 ]; then
+    echo "[warn] Java 8 tests will not run because JDK version is < 1.8."
+  fi
+}
 
-# Partial solution for SPARK-1455. Only run Hive tests if there are sql changes.
+# Only run Hive tests if there are sql changes.
+# Partial solution for SPARK-1455.
 if [ -n "$AMPLAB_JENKINS" ]; then
   git fetch origin master:master
-  diffs=`git diff --name-only master | grep "^\(sql/\)\|\(bin/spark-sql\)\|\(sbin/start-thriftserver.sh\)"`
-  if [ -n "$diffs" ]; then
-    echo "Detected changes in SQL. Will run Hive test suite."
+
+  sql_diffs=$(
+    git diff --name-only master \
+    | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
+  )
+
+  non_sql_diffs=$(
+    git diff --name-only master \
+    | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh"
+  )
+
+  if [ -n "$sql_diffs" ]; then
+    echo "[info] Detected changes in SQL. Will run Hive test suite."
     _RUN_SQL_TESTS=true
+
+    if [ -z "$non_sql_diffs" ]; then
+      echo "[info] Detected no changes except in SQL. Will only run SQL tests."
+      _SQL_TESTS_ONLY=true
+    fi
   fi
 fi
 
@@ -70,42 +99,69 @@ echo ""
 echo "========================================================================="
 echo "Running Apache RAT checks"
 echo "========================================================================="
-dev/check-license
+./dev/check-license
 
 echo ""
 echo "========================================================================="
 echo "Running Scala style checks"
 echo "========================================================================="
-dev/lint-scala
+./dev/lint-scala
 
 echo ""
 echo "========================================================================="
 echo "Running Python style checks"
 echo "========================================================================="
-dev/lint-python
+./dev/lint-python
+
+echo ""
+echo "========================================================================="
+echo "Building Spark"
+echo "========================================================================="
+
+{
+  # We always build with Hive because the PySpark Spark SQL tests need it.
+  BUILD_MVN_PROFILE_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive"
+
+  echo "[info] Building Spark with these arguments: $BUILD_MVN_PROFILE_ARGS"
+
+  # NOTE: echo "q" is needed because sbt on encountering a build file with failure
+  #+ (either resolution or compilation) prompts the user for input either q, r, etc
+  #+ to quit or retry. This echo is there to make it not block.
+  # QUESTION: Why doesn't 'yes "q"' work?
+  # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work?
+  echo -e "q\n" \
+    | sbt/sbt $BUILD_MVN_PROFILE_ARGS clean package assembly/assembly \
+    | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
+}
 
 echo ""
 echo "========================================================================="
 echo "Running Spark unit tests"
 echo "========================================================================="
 
-# Build Spark; we always build with Hive because the PySpark Spark SQL tests need it.
-# echo "q" is needed because sbt on encountering a build file with failure
-# (either resolution or compilation) prompts the user for input either q, r,
-# etc to quit or retry. This echo is there to make it not block.
-BUILD_MVN_PROFILE_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive "
-echo -e "q\n" | sbt/sbt $BUILD_MVN_PROFILE_ARGS clean package assembly/assembly | \
-  grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
-
-# If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled:
-if [ -n "$_RUN_SQL_TESTS" ]; then
-  SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive"
-fi
-# echo "q" is needed because sbt on encountering a build file with failure
-# (either resolution or compilation) prompts the user for input either q, r,
-# etc to quit or retry. This echo is there to make it not block.
-echo -e "q\n" | sbt/sbt $SBT_MAVEN_PROFILES_ARGS test | \
-  grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
+{
+  # If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled.
+  if [ -n "$_RUN_SQL_TESTS" ]; then
+    SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive"
+  fi
+
+  if [ -n "$_SQL_TESTS_ONLY" ]; then
+    SBT_MAVEN_TEST_ARGS="catalyst/test sql/test hive/test"
+  else
+    SBT_MAVEN_TEST_ARGS="test"
+  fi
+
+  echo "[info] Running Spark tests with these arguments: $SBT_MAVEN_PROFILES_ARGS $SBT_MAVEN_TEST_ARGS"
+  # NOTE: Do not quote the two argument variables passed to sbt; each holds several space-separated arguments.
+  # NOTE: echo "q" is needed because sbt on encountering a build file with failure
+  #+ (either resolution or compilation) prompts the user for input either q, r, etc
+  #+ to quit or retry. This echo is there to make it not block.
+  # QUESTION: Why doesn't 'yes "q"' work?
+  # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work?
+  echo -e "q\n" \
+    | sbt/sbt $SBT_MAVEN_PROFILES_ARGS $SBT_MAVEN_TEST_ARGS \
+    | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including"
+}
 
 echo ""
 echo "========================================================================="
@@ -117,4 +173,4 @@ echo ""
 echo "========================================================================="
 echo "Detecting binary incompatibilites with MiMa"
 echo "========================================================================="
-dev/mima
+./dev/mima