Merge remote-tracking branch 'apache-github/master' into network-receiver-api
pdeyhim · Apr 21, 2014 · ea27b38 · ea27b38
2 parents 3a4777c + fb98488
commit ea27b38
Show file tree

Hide file tree

Showing 215 changed files with 5,539 additions and 1,240 deletions.
diff --git a/.rat-excludes b/.rat-excludes
@@ -11,6 +11,7 @@ RELEASE
 control
 docs
 fairscheduler.xml.template
+spark-defaults.conf.template
 log4j.properties
 log4j.properties.template
 metrics.properties.template
@@ -40,5 +41,5 @@ work
 golden
 test.out/*
 .*iml
-python/metastore/service.properties
-python/metastore/db.lck
+service.properties
+db.lck
diff --git a/README.md b/README.md
@@ -10,20 +10,33 @@ guide, on the project webpage at <http://spark.apache.org/documentation.html>.
 This README file only contains basic setup instructions.
 
 
-## Building
+## Building Spark
 
-Spark requires Scala 2.10. The project is built using Simple Build Tool (SBT),
-which can be obtained [here](http://www.scala-sbt.org). If SBT is installed we
-will use the system version of sbt otherwise we will attempt to download it
-automatically. To build Spark and its example programs, run:
+Spark is built on Scala 2.10. To build Spark and its example programs, run:
 
     ./sbt/sbt assembly
 
-Once you've built Spark, the easiest way to start using it is the shell:
+## Interactive Scala Shell
+
+The easiest way to start using Spark is through the Scala shell:
 
     ./bin/spark-shell
 
-Or, for the Python API, the Python shell (`./bin/pyspark`).
+Try the following command, which should return 1000:
+
+    scala> sc.parallelize(1 to 1000).count()
+
+## Interactive Python Shell
+
+Alternatively, if you prefer Python, you can use the Python shell:
+
+    ./bin/pyspark
+
+And run the following command, which should also return 1000:
+
+    >>> sc.parallelize(range(1000)).count()
+
+## Example Programs
 
 Spark also comes with several sample programs in the `examples` directory.
 To run one of them, use `./bin/run-example <class> <params>`. For example:
@@ -38,13 +51,13 @@ All of the Spark samples take a `<master>` parameter that is the cluster URL
 to connect to. This can be a mesos:// or spark:// URL, or "local" to run
 locally with one thread, or "local[N]" to run locally with N threads.
 
-## Running tests
+## Running Tests
 
-Testing first requires [Building](#building) Spark. Once Spark is built, tests
+Testing first requires [building Spark](#building-spark). Once Spark is built, tests
 can be run using:
 
-`./sbt/sbt test`
- 
+    ./sbt/sbt test
+
 ## A Note About Hadoop Versions
 
 Spark uses the Hadoop core library to talk to HDFS and other Hadoop-supported

diff --git a/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala b/bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala
@@ -24,6 +24,8 @@ import org.scalatest.time.SpanSugar._
 import org.apache.spark._
 import org.apache.spark.storage.StorageLevel
 
+import scala.language.postfixOps
+
 class TestVertex(val active: Boolean, val age: Int) extends Vertex with Serializable
 class TestMessage(val targetId: String) extends Message[String] with Serializable
 

diff --git a/bin/run-example b/bin/run-example
@@ -75,7 +75,6 @@ fi
 
 # Set JAVA_OPTS to be able to load native libraries and to set heap size
 JAVA_OPTS="$SPARK_JAVA_OPTS"
-JAVA_OPTS="$JAVA_OPTS -Djava.library.path=$SPARK_LIBRARY_PATH"
 # Load extra JAVA_OPTS from conf/java-opts, if it exists
 if [ -e "$FWDIR/conf/java-opts" ] ; then
   JAVA_OPTS="$JAVA_OPTS `cat $FWDIR/conf/java-opts`"

diff --git a/bin/spark-class b/bin/spark-class
@@ -98,7 +98,7 @@ fi
 
 # Set JAVA_OPTS to be able to load native libraries and to set heap size
 JAVA_OPTS="$OUR_JAVA_OPTS"
-JAVA_OPTS="$JAVA_OPTS -Djava.library.path=$SPARK_LIBRARY_PATH"
+JAVA_OPTS="$JAVA_OPTS -Djava.library.path=$_SPARK_LIBRARY_PATH"
 JAVA_OPTS="$JAVA_OPTS -Xms$OUR_JAVA_MEM -Xmx$OUR_JAVA_MEM"
 # Load extra JAVA_OPTS from conf/java-opts, if it exists
 if [ -e "$FWDIR/conf/java-opts" ] ; then

diff --git a/bin/spark-submit b/bin/spark-submit
@@ -25,8 +25,13 @@ while (($#)); do
     DEPLOY_MODE=$2
   elif [ $1 = "--driver-memory" ]; then
     DRIVER_MEMORY=$2
+  elif [ $1 = "--driver-library-path" ]; then
+    export _SPARK_LIBRARY_PATH=$2
+  elif [ $1 = "--driver-class-path" ]; then
+    export SPARK_CLASSPATH="$SPARK_CLASSPATH:$2"
+  elif [ $1 = "--driver-java-options" ]; then
+    export SPARK_JAVA_OPTS="$SPARK_JAVA_OPTS $2"
   fi
-
   shift
 done
 

diff --git a/conf/spark-defaults.conf.template b/conf/spark-defaults.conf.template
@@ -0,0 +1,7 @@
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master            spark://master:7077
+# spark.eventLog.enabled  true
+# spark.eventLog.dir      hdfs://namenode:8021/directory
diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template
@@ -1,22 +1,41 @@
 #!/usr/bin/env bash
 
-# This file contains environment variables required to run Spark. Copy it as
-# spark-env.sh and edit that to configure Spark for your site.
-#
-# The following variables can be set in this file:
+# This file is sourced when running various Spark programs.
+# Copy it as spark-env.sh and edit that to configure Spark for your site.
+
+# Options read when launching programs locally with 
+# ./bin/run-example or ./bin/spark-submit
+# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
+# - SPARK_CLASSPATH, default classpath entries to append
+
+# Options read by executors and drivers running inside the cluster
 # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
+# - SPARK_CLASSPATH, default classpath entries to append
+# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
 # - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
-# - SPARK_JAVA_OPTS, to set node-specific JVM options for Spark. Note that
-#   we recommend setting app-wide options in the application's driver program.
-#     Examples of node-specific options : -Dspark.local.dir, GC options
-#     Examples of app-wide options : -Dspark.serializer
-#
-# If using the standalone deploy mode, you can also set variables for it here:
+
+# Options read in YARN client mode
+# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
+# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
+# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
+# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512M)
+# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
+# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: 'default')
+# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
+# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.
+
+# Options for the daemons used in the standalone deploy mode:
 # - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
 # - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports
+# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
 # - SPARK_WORKER_CORES, to set the number of cores to use on this machine
-# - SPARK_WORKER_MEMORY, to set how much memory to use (e.g. 1000m, 2g)
+# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
 # - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT
 # - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
 # - SPARK_WORKER_DIR, to set the working directory of worker processes
-# - SPARK_PUBLIC_DNS, to set the public dns name of the master
+# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
+# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
+# - SPARK_DAEMON_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
+# - SPARK_PUBLIC_DNS, to set the public DNS name of the master or workers
diff --git a/core/pom.xml b/core/pom.xml
@@ -266,6 +266,11 @@
       <artifactId>junit-interface</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.spark-project</groupId>
+      <artifactId>pyrolite</artifactId>
+      <version>2.0</version>
+    </dependency>
   </dependencies>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>

diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala
@@ -89,12 +89,14 @@ class HashPartitioner(partitions: Int) extends Partitioner {
  * A [[org.apache.spark.Partitioner]] that partitions sortable records by range into roughly
  * equal ranges. The ranges are determined by sampling the content of the RDD passed in.
  */
-class RangePartitioner[K <% Ordered[K]: ClassTag, V](
+class RangePartitioner[K : Ordering : ClassTag, V](
     partitions: Int,
     @transient rdd: RDD[_ <: Product2[K,V]],
     private val ascending: Boolean = true)
   extends Partitioner {
 
+  private val ordering = implicitly[Ordering[K]]
+
   // An array of upper bounds for the first (partitions - 1) partitions
   private val rangeBounds: Array[K] = {
     if (partitions == 1) {
@@ -103,7 +105,7 @@ class RangePartitioner[K <% Ordered[K]: ClassTag, V](
       val rddSize = rdd.count()
       val maxSampleSize = partitions * 20.0
       val frac = math.min(maxSampleSize / math.max(rddSize, 1), 1.0)
-      val rddSample = rdd.sample(false, frac, 1).map(_._1).collect().sortWith(_ < _)
+      val rddSample = rdd.sample(false, frac, 1).map(_._1).collect().sorted
       if (rddSample.length == 0) {
         Array()
       } else {
@@ -126,7 +128,7 @@ class RangePartitioner[K <% Ordered[K]: ClassTag, V](
     var partition = 0
     if (rangeBounds.length < 1000) {
       // If we have fewer than 1000 range bounds, use a naive linear search
-      while (partition < rangeBounds.length && k > rangeBounds(partition)) {
+      while (partition < rangeBounds.length && ordering.gt(k, rangeBounds(partition))) {
         partition += 1
       }
     } else {

diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -208,6 +208,82 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
     new SparkConf(false).setAll(settings)
   }
 
+  /**
+   * Checks for illegal or deprecated config settings. Throws an exception for the former.
+   *
+   * Not idempotent - may mutate this conf object to convert deprecated settings to supported
+   * ones: when the SPARK_JAVA_OPTS or SPARK_CLASSPATH environment variables are set, their
+   * values are copied into the corresponding executor and driver settings.
+   *
+   * Throws an Exception if spark.executor.extraJavaOptions contains "-Dspark", "-Xmx" or
+   * "-Xms", and a SparkException if a legacy environment variable is set together with one of
+   * the settings that replaces it.
+   */
+  private[spark] def validateSettings() {
+    if (settings.contains("spark.local.dir")) {
+      val msg = "In Spark 1.0 and later spark.local.dir will be overridden by the value set by " +
+        "the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone and LOCAL_DIRS in YARN)."
+      logWarning(msg)
+    }
+
+    val executorOptsKey = "spark.executor.extraJavaOptions"
+    val executorClasspathKey = "spark.executor.extraClassPath"
+    val driverOptsKey = "spark.driver.extraJavaOptions"
+    val driverClassPathKey = "spark.driver.extraClassPath"
+
+    // Validate spark.executor.extraJavaOptions.
+    // foreach (not map): the body is run only for its side effects (throwing on bad values).
+    settings.get(executorOptsKey).foreach { javaOpts =>
+      if (javaOpts.contains("-Dspark")) {
+        val msg = s"$executorOptsKey is not allowed to set Spark options (was '$javaOpts'). " +
+          "Set them directly on a SparkConf or in a properties file when using ./bin/spark-submit."
+        throw new Exception(msg)
+      }
+      if (javaOpts.contains("-Xmx") || javaOpts.contains("-Xms")) {
+        val msg = s"$executorOptsKey is not allowed to alter memory settings (was '$javaOpts'). " +
+          "Use spark.executor.memory instead."
+        throw new Exception(msg)
+      }
+    }
+
+    // Check for legacy configs
+    sys.env.get("SPARK_JAVA_OPTS").foreach { value =>
+      val error =
+        s"""
+          |SPARK_JAVA_OPTS was detected (set to '$value').
+          |This has undefined behavior when running on a cluster and is deprecated in Spark 1.0+.
+          |
+          |Please instead use:
+          | - ./spark-submit with conf/spark-defaults.conf to set defaults for an application
+          | - ./spark-submit with --driver-java-options to set -X options for a driver
+          | - spark.executor.extraJavaOptions to set -X options for executors
+          | - SPARK_DAEMON_OPTS to set java options for standalone daemons (i.e. master, worker)
+        """.stripMargin
+      logError(error)
+
+      // Mixing the legacy variable with its replacement is rejected; otherwise the legacy value
+      // is copied into both the executor and driver settings so it keeps taking effect.
+      for (key <- Seq(executorOptsKey, driverOptsKey)) {
+        if (getOption(key).isDefined) {
+          throw new SparkException(s"Found both $key and SPARK_JAVA_OPTS. Use only the former.")
+        } else {
+          logWarning(s"Setting '$key' to '$value' as a work-around.")
+          set(key, value)
+        }
+      }
+    }
+
+    sys.env.get("SPARK_CLASSPATH").foreach { value =>
+      val error =
+        s"""
+          |SPARK_CLASSPATH was detected (set to '$value').
+          | This has undefined behavior when running on a cluster and is deprecated in Spark 1.0+.
+          |
+          |Please instead use:
+          | - ./spark-submit with --driver-class-path to augment the driver classpath
+          | - spark.executor.extraClassPath to augment the executor classpath
+        """.stripMargin
+      logError(error)
+
+      // Same policy as for SPARK_JAVA_OPTS: fail on a conflict, otherwise migrate the value.
+      for (key <- Seq(executorClasspathKey, driverClassPathKey)) {
+        if (getOption(key).isDefined) {
+          throw new SparkException(s"Found both $key and SPARK_CLASSPATH. Use only the former.")
+        } else {
+          logWarning(s"Setting '$key' to '$value' as a work-around.")
+          set(key, value)
+        }
+      }
+    }
+  }
+
   /**
    * Return a string listing all keys and values, one per line. This is useful to print the
    * configuration out for debugging.