Merge branch 'master' into SPARK-1149
liguoqiang committed Mar 3, 2014
2 parents 3348619 + f65c1f3 commit 1e3331e
Showing 36 changed files with 756 additions and 201 deletions.
32 changes: 32 additions & 0 deletions LICENSE
@@ -396,3 +396,35 @@ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.


========================================================================
For sbt and sbt-launch-lib.bash in sbt/:
========================================================================

// Generated from http://www.opensource.org/licenses/bsd-license.php
Copyright (c) 2011, Paul Phillips.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the author nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4 changes: 2 additions & 2 deletions assembly/pom.xml
@@ -21,14 +21,14 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent</artifactId>
<version>1.0.0-incubating-SNAPSHOT</version>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

<groupId>org.apache.spark</groupId>
<artifactId>spark-assembly_2.10</artifactId>
<name>Spark Project Assembly</name>
<url>http://spark.incubator.apache.org/</url>
<url>http://spark.apache.org/</url>

<properties>
<spark.jar>${project.build.directory}/scala-${scala.binary.version}/${project.artifactId}-${project.version}-hadoop${hadoop.version}.jar</spark.jar>
18 changes: 16 additions & 2 deletions bagel/pom.xml
@@ -21,15 +21,29 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent</artifactId>
<version>1.0.0-incubating-SNAPSHOT</version>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

<groupId>org.apache.spark</groupId>
<artifactId>spark-bagel_2.10</artifactId>
<packaging>jar</packaging>
<name>Spark Project Bagel</name>
<url>http://spark.incubator.apache.org/</url>
<url>http://spark.apache.org/</url>

<profiles>
<profile>
<!-- SPARK-1121: Adds an explicit dependency on Avro to work around
a Hadoop 0.23.X issue -->
<id>yarn-alpha</id>
<dependencies>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</dependency>
</dependencies>
</profile>
</profiles>

<dependencies>
<dependency>
27 changes: 25 additions & 2 deletions core/pom.xml
@@ -21,15 +21,29 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent</artifactId>
<version>1.0.0-incubating-SNAPSHOT</version>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<packaging>jar</packaging>
<name>Spark Project Core</name>
<url>http://spark.incubator.apache.org/</url>
<url>http://spark.apache.org/</url>

<!-- SPARK-1121: Adds an explicit dependency on Avro to work around
a Hadoop 0.23.X issue -->
<profiles>
<profile>
<id>yarn-alpha</id>
<dependencies>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</dependency>
</dependencies>
</profile>
</profiles>

<dependencies>
<dependency>
@@ -125,6 +139,15 @@
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_${scala.binary.version}</artifactId>
<version>3.2.6</version>
<!-- see also exclusion for lift-json; this is necessary since it depends on
scala-library and scalap 2.10.0, but we use 2.10.3, and only override
scala-library -->
<exclusions>
<exclusion>
<groupId>org.scala-lang</groupId>
<artifactId>scalap</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>it.unimi.dsi</groupId>
@@ -64,11 +64,11 @@ class ZooKeeperPersistenceEngine(serialization: Serialization, conf: SparkConf)
override def readPersistedData(): (Seq[ApplicationInfo], Seq[DriverInfo], Seq[WorkerInfo]) = {
val sortedFiles = zk.getChildren().forPath(WORKING_DIR).toList.sorted
val appFiles = sortedFiles.filter(_.startsWith("app_"))
val apps = appFiles.map(deserializeFromFile[ApplicationInfo])
val apps = appFiles.map(deserializeFromFile[ApplicationInfo]).flatten
val driverFiles = sortedFiles.filter(_.startsWith("driver_"))
val drivers = driverFiles.map(deserializeFromFile[DriverInfo])
val drivers = driverFiles.map(deserializeFromFile[DriverInfo]).flatten
val workerFiles = sortedFiles.filter(_.startsWith("worker_"))
val workers = workerFiles.map(deserializeFromFile[WorkerInfo])
val workers = workerFiles.map(deserializeFromFile[WorkerInfo]).flatten
(apps, drivers, workers)
}

@@ -78,10 +78,18 @@ class ZooKeeperPersistenceEngine(serialization: Serialization, conf: SparkConf)
zk.create().withMode(CreateMode.PERSISTENT).forPath(path, serialized)
}

def deserializeFromFile[T](filename: String)(implicit m: Manifest[T]): T = {
def deserializeFromFile[T](filename: String)(implicit m: Manifest[T]): Option[T] = {
val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename)
val clazz = m.runtimeClass.asInstanceOf[Class[T]]
val serializer = serialization.serializerFor(clazz)
serializer.fromBinary(fileData).asInstanceOf[T]
try {
Some(serializer.fromBinary(fileData).asInstanceOf[T])
} catch {
case e: Exception => {
logWarning("Exception while reading persisted file, deleting", e)
zk.delete().forPath(WORKING_DIR + "/" + filename)
None
}
}
}
}
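
One behavioral fix in this merge: `deserializeFromFile` now returns `Option[T]`, so a corrupted ZooKeeper entry is logged, deleted, and dropped from the recovered state via `.flatten` instead of crashing recovery with an exception. Below is a minimal standalone sketch of the same map-then-flatten pattern, with hypothetical names and plain strings standing in for serialized data:

```scala
// Standalone sketch (hypothetical names) of the pattern applied above:
// deserialization returns Option[T], and flatten drops failed entries.
object RecoverySketch {
  def deserialize(raw: String): Option[Int] =
    try {
      Some(raw.trim.toInt) // well-formed entry
    } catch {
      case _: NumberFormatException =>
        println(s"Skipping corrupted entry: $raw") // stands in for logWarning + delete
        None
    }

  def main(args: Array[String]): Unit = {
    val entries = Seq("1", "not-a-number", "3")
    val recovered = entries.map(deserialize).flatten // Seq(1, 3)
    println(recovered.mkString(", "))                // prints: 1, 3
  }
}
```

In `readPersistedData` this means `apps`, `drivers`, and `workers` each become the sequence of entries that deserialized successfully, so a single bad znode no longer prevents the master from recovering the rest.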
8 changes: 2 additions & 6 deletions dev/audit-release/audit_release.py
@@ -31,10 +31,10 @@
import urllib2

## Fill in release details here:
RELEASE_URL = "http://people.apache.org/~pwendell/spark-0.9.0-incubating-rc5/"
RELEASE_URL = "http://people.apache.org/~pwendell/spark-1.0.0-rc1/"
RELEASE_KEY = "9E4FE3AF"
RELEASE_REPOSITORY = "https://repository.apache.org/content/repositories/orgapachespark-1006/"
RELEASE_VERSION = "0.9.0-incubating"
RELEASE_VERSION = "1.0.0"
SCALA_VERSION = "2.10.3"
SCALA_BINARY_VERSION = "2.10"
##
@@ -191,10 +191,6 @@ def ensure_path_not_present(x):
test("NOTICE" in base_files, "Tarball contains NOTICE file")
test("LICENSE" in base_files, "Tarball contains LICENSE file")

os.chdir(os.path.join(WORK_DIR, dir_name))
readme = "".join(open("README.md").readlines())
disclaimer_part = "is an effort undergoing incubation"
test(disclaimer_part in readme, "README file contains disclaimer")
os.chdir(WORK_DIR)

for artifact in artifacts:
2 changes: 1 addition & 1 deletion dev/create-release/create-release.sh
@@ -120,7 +120,7 @@ scp spark* \
# Docs
cd spark
cd docs
jekyll build
PRODUCTION=1 jekyll build
echo "Copying release documentation"
rc_docs_folder=${rc_folder}-docs
rsync -r _site/* $USER_NAME@people.apache.org:/home/$USER_NAME/public_html/$rc_docs_folder
19 changes: 16 additions & 3 deletions docs/README.md
@@ -10,9 +10,22 @@ We include the Spark documentation as part of the source (as opposed to using a

In this directory you will find textfiles formatted using Markdown, with an ".md" suffix. You can read those text files directly if you want. Start with index.md.

To make things quite a bit prettier and make the links easier to follow, generate the html version of the documentation based on the src directory by running `jekyll build` in the docs directory. Use the command `SKIP_SCALADOC=1 jekyll build` to skip building and copying over the scaladoc which can be timely. To use the `jekyll` command, you will need to have Jekyll installed, the easiest way to do this is via a Ruby Gem, see the [jekyll installation instructions](http://jekyllrb.com/docs/installation). This will create a directory called _site containing index.html as well as the rest of the compiled files. Read more about Jekyll at https://github.com/mojombo/jekyll/wiki.

In addition to generating the site as html from the markdown files, jekyll can serve up the site via a webserver. To build and run a local webserver use the command `jekyll serve` (or the faster variant `SKIP_SCALADOC=1 jekyll serve`), which runs the webserver on port 4000, then visit the site at http://localhost:4000.
The markdown code can be compiled to HTML using the
[Jekyll tool](http://jekyllrb.com).
To use the `jekyll` command, you will need to have Jekyll installed.
The easiest way to do this is via a Ruby Gem, see the
[jekyll installation instructions](http://jekyllrb.com/docs/installation).
Compiling the site with Jekyll will create a directory called
_site containing index.html as well as the rest of the compiled files.

You can modify the default Jekyll build as follows:

# Skip generating API docs (which takes a while)
$ SKIP_SCALADOC=1 jekyll build
# Serve content locally on port 4000
$ jekyll serve --watch
# Build the site with extra features used on the live page
$ PRODUCTION=1 jekyll build

## Pygments

4 changes: 2 additions & 2 deletions docs/_layouts/global.html
@@ -24,9 +24,9 @@

<link rel="stylesheet" href="css/pygments-default.css">

{% production %}
<!-- Google analytics script -->
<script type="text/javascript">
/*
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-32518208-1']);
_gaq.push(['_trackPageview']);
@@ -36,8 +36,8 @@
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
*/
</script>
{% endproduction %}

</head>
<body>
14 changes: 14 additions & 0 deletions docs/_plugins/production_tag.rb
@@ -0,0 +1,14 @@
module Jekyll
class ProductionTag < Liquid::Block

def initialize(tag_name, markup, tokens)
super
end

def render(context)
if ENV['PRODUCTION'] then super else "" end
end
end
end

Liquid::Template.register_tag('production', Jekyll::ProductionTag)
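
This plugin is what makes the `{% production %} ... {% endproduction %}` block in `docs/_layouts/global.html` above work: the wrapped content (the Google Analytics snippet) is rendered only when the `PRODUCTION` environment variable is set, which is why `dev/create-release/create-release.sh` and the docs README now build the live site with `PRODUCTION=1 jekyll build`.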
4 changes: 0 additions & 4 deletions docs/building-with-maven.md
@@ -76,7 +76,3 @@ The maven build includes support for building a Debian package containing the as
$ mvn -Pdeb -DskipTests clean package

The debian package can then be found under assembly/target. We added the short commit hash to the file name so that we can distinguish individual packages built for SNAPSHOT versions.

## A note about Hadoop version 0.23.x

For building spark with hadoop 0.23.x and also yarn, you will have to manually add a dependency on avro (org.apache.avro, avro, 1.7.4).
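
(The manual Avro dependency described here is now added automatically by the new `yarn-alpha` profiles in `bagel/pom.xml` and `core/pom.xml` above, per SPARK-1121, so the note is removed.)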
40 changes: 34 additions & 6 deletions docs/streaming-programming-guide.md
@@ -58,11 +58,21 @@ do is as follows.

<div class="codetabs">
<div data-lang="scala" markdown="1" >
First, we import the names of the Spark Streaming classes, and some implicit
conversions from StreamingContext into our environment, to add useful methods to
other classes we need (like DStream).

First, we create a
[StreamingContext](api/streaming/index.html#org.apache.spark.streaming.StreamingContext) object,
which is the main entry point for all streaming
functionality. Besides Spark's configuration, we specify that any DStream will be processed
[StreamingContext](api/streaming/index.html#org.apache.spark.streaming.StreamingContext) is the
main entry point for all streaming functionality.

{% highlight scala %}
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
{% endhighlight %}

Then we create a
[StreamingContext](api/streaming/index.html#org.apache.spark.streaming.StreamingContext) object.
Besides Spark's configuration, we specify that any DStream will be processed
in 1 second batches.

{% highlight scala %}
@@ -98,7 +108,7 @@ val pairs = words.map(word => (word, 1))
val wordCounts = pairs.reduceByKey(_ + _)

// Print a few of the counts to the console
wordCount.print()
wordCounts.print()
{% endhighlight %}

The `words` DStream is further mapped (one-to-one transformation) to a DStream of `(word,
@@ -178,7 +188,7 @@ JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(
return i1 + i2;
}
});
wordCount.print(); // Print a few of the counts to the console
wordCounts.print(); // Print a few of the counts to the console
{% endhighlight %}

The `words` DStream is further mapped (one-to-one transformation) to a DStream of `(word,
@@ -262,6 +272,24 @@ Time: 1357008430000 ms
</td>
</table>

If you plan to run the Scala code for Spark Streaming-based use cases in the Spark
shell, you should start the shell with the Spark configuration pre-configured to
discard old batches periodically:

{% highlight bash %}
$ SPARK_JAVA_OPTS=-Dspark.cleaner.ttl=10000 bin/spark-shell
{% endhighlight %}

... and create your StreamingContext by wrapping the existing interactive shell
SparkContext object, `sc`:

{% highlight scala %}
val ssc = new StreamingContext(sc, Seconds(1))
{% endhighlight %}

When working with the shell, you may also need to send a `^D` to your netcat session
to force the pipeline to print the word counts to the console at the sink.
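
Putting the shell snippets together, a minimal end-to-end session might look like the following. This is a sketch assembled from the examples in this guide; it assumes a netcat server is feeding text on localhost port 9999 (for example `nc -lk 9999` in another terminal):

```scala
// Sketch of a complete shell session (assumed socket source on localhost:9999).
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._

// Wrap the shell's existing SparkContext `sc` in 1-second batches.
val ssc = new StreamingContext(sc, Seconds(1))

val lines = ssc.socketTextStream("localhost", 9999)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _)
wordCounts.print() // print a few counts for each batch

ssc.start() // begin receiving and processing data
```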

***************************************************************************************************

# Basics
2 changes: 1 addition & 1 deletion ec2/README
@@ -1,4 +1,4 @@
This folder contains a script, spark-ec2, for launching Spark clusters on
Amazon EC2. Usage instructions are available online at:

http://spark.incubator.apache.org/docs/latest/ec2-scripts.html
http://spark.apache.org/docs/latest/ec2-scripts.html
2 changes: 1 addition & 1 deletion ec2/spark_ec2.py
@@ -73,7 +73,7 @@ def parse_args():
parser.add_option("-v", "--spark-version", default="0.9.0",
help="Version of Spark to use: 'X.Y.Z' or a specific git hash")
parser.add_option("--spark-git-repo",
default="https://github.com/apache/incubator-spark",
default="https://github.com/apache/spark",
help="Github repo from which to checkout supplied commit hash")
parser.add_option("--hadoop-major-version", default="1",
help="Major version of Hadoop (default: 1)")