SPARK-1251 Support for optimizing and executing structured queries #146

Closed
wants to merge 862 commits
Changes from 250 commits
Commits (862)
605255e
Added scalastyle checker.
rxin Jan 30, 2014
08e4d05
First round of style cleanup.
rxin Jan 30, 2014
7213a2c
style fix for Hive.scala.
rxin Jan 31, 2014
5c1e600
Added hash code implementation for AttributeReference
rxin Jan 31, 2014
7e24436
Removed dependency on JDK 7 (nio.file).
rxin Jan 31, 2014
41bbee6
Merge remote-tracking branch 'upstream/master' into exchangeOperator
yhuai Jan 31, 2014
f47c2f6
set outputPartitioning in BroadcastNestedLoopJoin
yhuai Jan 31, 2014
d91e276
Remove dependence on HIVE_HOME for running tests. This was done by m…
marmbrus Jan 31, 2014
bce024d
Merge remote-tracking branch 'databricks/master' into style
marmbrus Jan 31, 2014
d20b565
fix if style
marmbrus Jan 31, 2014
807b2d7
check style and publish docs with travis
marmbrus Feb 1, 2014
d3a3d48
add testing to travis
marmbrus Feb 1, 2014
271e483
Update build status icon.
marmbrus Feb 1, 2014
6015f93
Merge pull request #29 from rxin/style
marmbrus Feb 1, 2014
fc67b50
Check for a Sort operator with the global flag set instead of an Exch…
yhuai Feb 1, 2014
235cbb4
Merge remote-tracking branch 'upstream/master' into exchangeOperator
yhuai Feb 1, 2014
45b334b
fix comments
yhuai Feb 1, 2014
e079f2b
Add GenericUDAF wrapper and HiveUDAFFunction
tnachen Jan 16, 2014
8e0931f
Cast to avoid using deprecated hive API.
marmbrus Jan 28, 2014
b1151a8
Fix load data regex
tnachen Jan 29, 2014
5b7afd8
Merge pull request #10 from yhuai/exchangeOperator
marmbrus Feb 2, 2014
6eb5960
Merge remote-tracking branch 'databricks/master' into udafs
marmbrus Feb 2, 2014
41b41f3
Only cast unresolved inserts.
marmbrus Feb 2, 2014
63003e9
Fix spacing.
marmbrus Feb 2, 2014
2de89d0
Merge pull request #13 from tnachen/master
marmbrus Feb 2, 2014
cb775ac
get rid of SharkContext singleton
marmbrus Jan 12, 2014
dfb67aa
add test case
marmbrus Jan 13, 2014
19bfd74
store hive output in circular buffer
marmbrus Jan 22, 2014
1590568
add log4j.properties
marmbrus Feb 3, 2014
b649c20
fix test logging / caching.
marmbrus Feb 3, 2014
7845364
deactivate concurrent test.
marmbrus Feb 3, 2014
ea6f37f
fix style.
marmbrus Feb 3, 2014
82163e3
special case handling of partitionKeys when casting insert into tables
marmbrus Feb 3, 2014
9c22b4e
Support for parsing nested types.
marmbrus Feb 1, 2014
efa7217
Support for reading structs in HiveTableScan.
marmbrus Feb 1, 2014
d670e41
Print nested fields like hive does.
marmbrus Feb 1, 2014
dc6463a
Support for resolving access to nested fields using "." notation.
marmbrus Feb 1, 2014
6709441
Evaluation for accessing nested fields.
marmbrus Feb 1, 2014
da7ae9d
Add boolean writable that was breaking udf_regexp test. Not sure how…
marmbrus Feb 1, 2014
6420c7c
Memoize the ordinal in the GetField expression.
marmbrus Feb 1, 2014
1579eec
Only cast unresolved inserts.
marmbrus Feb 2, 2014
cf8d992
Use built in functions for creating temp directory.
marmbrus Feb 2, 2014
c654f19
Support for list and maps in hive table scan.
marmbrus Feb 2, 2014
c3feda7
use toArray.
marmbrus Feb 2, 2014
a9388fb
printing for map types.
marmbrus Feb 2, 2014
35a70fb
multi-letter field names.
marmbrus Feb 2, 2014
2c6deb3
improve printing compatibility.
marmbrus Feb 2, 2014
5b33216
work on decimal support.
marmbrus Feb 3, 2014
5b3d2c8
implement distinct.
marmbrus Feb 3, 2014
3f9e519
use names w/ boolean args
marmbrus Feb 3, 2014
3734a94
only quote string types.
marmbrus Feb 3, 2014
bbec500
update test coverage, new golden
marmbrus Feb 2, 2014
5e54aa6
quotes for struct field names.
marmbrus Feb 3, 2014
e4def6b
set dataType for HiveGenericUdfs.
marmbrus Feb 3, 2014
aa430e7
Update .travis.yml
marmbrus Feb 3, 2014
7661b6c
blacklist machines specific tests
marmbrus Feb 3, 2014
72a003d
revert regex change
marmbrus Feb 3, 2014
9c06778
fix serialization issues, add JavaStringObjectInspector.
marmbrus Feb 3, 2014
92e4158
Merge pull request #32 from marmbrus/tooManyProjects
rxin Feb 3, 2014
692a477
Support for wrapping arrays to be written into hive tables.
marmbrus Feb 4, 2014
ac9d7de
Resolve *s in Transform clauses.
marmbrus Feb 4, 2014
7a0f543
Avoid propagating types from unresolved nodes.
marmbrus Feb 4, 2014
010accb
add tinyint to metastore type parser.
marmbrus Feb 4, 2014
e7933e9
fix casting bug when working with fractional expressions.
marmbrus Feb 4, 2014
25288d0
Implement [] for arrays and maps.
marmbrus Feb 4, 2014
ab9a131
when UDFs fail they should return null.
marmbrus Feb 4, 2014
1679554
add toString for if and IS NOT NULL.
marmbrus Feb 4, 2014
ab5bff3
Support for get item of map types.
marmbrus Feb 4, 2014
42ec4af
improve complex type support in hive udfs/udafs.
marmbrus Feb 4, 2014
44d343c
Merge remote-tracking branch 'databricks/master' into complex
marmbrus Feb 4, 2014
e3c10bd
update whitelist.
marmbrus Feb 4, 2014
389525d
update golden, blacklist mr.
marmbrus Feb 4, 2014
2f27604
Address comments / style errors.
marmbrus Feb 4, 2014
cb57459
blacklist machine specific test.
marmbrus Feb 4, 2014
67128b8
Merge pull request #30 from marmbrus/complex
rxin Feb 4, 2014
b4be6a5
better logging when applying rules.
marmbrus Feb 5, 2014
ccdb07a
Fix bug where averages of strings are turned into sums of strings. R…
marmbrus Feb 5, 2014
d8cb805
Implement partial aggregation.
marmbrus Feb 5, 2014
f94345c
fix doc link
marmbrus Feb 5, 2014
e1999f9
Use Deserializer and Serializer instead of AbstractSerDe.
yhuai Feb 5, 2014
32b615b
add override to asPartial.
marmbrus Feb 5, 2014
883006d
improve tests.
marmbrus Feb 5, 2014
cab1a84
Fix PartialAggregate inheritance.
marmbrus Feb 5, 2014
dc6353b
turn off deprecation
marmbrus Feb 5, 2014
8017afb
fix copy paste error.
marmbrus Feb 5, 2014
5479066
Merge pull request #36 from marmbrus/partialAgg
rxin Feb 5, 2014
5e4d9b4
Merge pull request #35 from marmbrus/smallFixes
marmbrus Feb 7, 2014
02ff8e4
Correctly parse the db name and table name in a CTAS query.
yhuai Feb 7, 2014
8841eb8
Rename Transform -> ScriptTransformation.
marmbrus Feb 7, 2014
acb9566
Correctly type attributes of CTAS.
marmbrus Feb 7, 2014
016b489
fix typo.
marmbrus Feb 7, 2014
bea4b7f
Add SumDistinct.
marmbrus Feb 7, 2014
ea76cf9
Add NoRelation to planner.
marmbrus Feb 7, 2014
dd00b7e
initial implementation of generators.
marmbrus Feb 7, 2014
ba8897f
Merge remote-tracking branch 'yin/parseDBNameInCTAS' into lateralView
marmbrus Feb 7, 2014
0ce61b0
Docs for GenericHiveUdtf.
marmbrus Feb 7, 2014
740febb
Tests for tgfs.
marmbrus Feb 7, 2014
db92adc
more tests passing. clean up logging.
marmbrus Feb 7, 2014
ff5ea3f
new golden
marmbrus Feb 7, 2014
5cc367c
use berkeley instead of cloudbees
marmbrus Feb 8, 2014
b376d15
fix newlines at EOF
marmbrus Feb 8, 2014
7123225
Correctly parse the db name and table name in INSERT queries.
yhuai Feb 8, 2014
2897deb
fix scaladoc
marmbrus Feb 8, 2014
0e6c1d7
Merge pull request #38 from yhuai/parseDBNameInCTAS
rxin Feb 8, 2014
341116c
address comments.
marmbrus Feb 8, 2014
7785ee6
Tighten visibility based on comments.
marmbrus Feb 10, 2014
964368f
Merge pull request #39 from marmbrus/lateralView
marmbrus Feb 11, 2014
dce0593
move golden answer to the source code directory.
marmbrus Feb 11, 2014
9329820
add golden answer files to repository
marmbrus Feb 11, 2014
a7ad058
Merge pull request #40 from marmbrus/includeGoldens
marmbrus Feb 11, 2014
2407a21
Added optimized logical plan to debugging output
liancheng Feb 12, 2014
cf691df
Added the PhysicalOperation to generalize ColumnPrunings
liancheng Feb 12, 2014
f235914
Test case udf_regex and udf_like need BooleanWritable registered
liancheng Feb 12, 2014
f0c3742
Refactored PhysicalOperation
liancheng Feb 12, 2014
5720d2b
Fixed comment typo
liancheng Feb 12, 2014
bc9a12c
Move hive test files.
marmbrus Feb 13, 2014
7588a57
Break into 3 major components and move everything into the org.apache…
marmbrus Feb 13, 2014
1f7d00a
Merge pull request #41 from marmbrus/splitComponents
rxin Feb 14, 2014
887f928
Merge remote-tracking branch 'upstream/master' into SerDeNew
yhuai Feb 14, 2014
678341a
Replaced non-ascii text
markhamstra Feb 14, 2014
5ae010f
Merge pull request #42 from markhamstra/non-ascii
marmbrus Feb 14, 2014
1f6260d
Fixed package name and test suite name in Makefile
liancheng Feb 14, 2014
b6de691
Merge pull request #43 from liancheng/fixMakefile
marmbrus Feb 14, 2014
7f206b5
Add support for hive TABLESAMPLE PERCENT.
marmbrus Feb 14, 2014
ed3a1d1
Load data directly into Hive.
yhuai Feb 14, 2014
59e37a3
Merge remote-tracking branch 'upstream/master' into SerDeNew
yhuai Feb 14, 2014
346f828
Move SharkHadoopWriter to the correct location.
yhuai Feb 15, 2014
a9c3188
Fix udaf struct return
tnachen Feb 15, 2014
69adf72
Set cloneRecords to false.
yhuai Feb 15, 2014
566fd66
Whitelist tests and add support for Binary type
tnachen Feb 15, 2014
9ad474d
Merge pull request #44 from marmbrus/sampling
marmbrus Feb 15, 2014
3cb4f2e
Merge pull request #45 from tnachen/master
marmbrus Feb 15, 2014
8506c17
Address review feedback.
marmbrus Feb 15, 2014
3bb272d
move org.apache.spark.sql package.scala to the correct location.
marmbrus Feb 15, 2014
1596e1b
Cleanup imports to make IntelliJ happy.
yhuai Feb 15, 2014
5495fab
Remove cloneRecords which is no longer needed.
yhuai Feb 15, 2014
bdab5ed
Add a TODO for loading data into partitioned tables.
yhuai Feb 15, 2014
35c9a8a
Merge pull request #46 from marmbrus/reviewFeedback
marmbrus Feb 15, 2014
563bb22
Set compression info in FileSinkDesc.
yhuai Feb 16, 2014
e089627
Code style.
yhuai Feb 16, 2014
45ffb86
Merge remote-tracking branch 'upstream/master' into SerDeNew
yhuai Feb 16, 2014
eea75c5
Correctly set codec.
yhuai Feb 16, 2014
428aff5
Distinguish `INSERT INTO` and `INSERT OVERWRITE`.
yhuai Feb 16, 2014
a40d6d6
Loading the static partition specified in a INSERT INTO/OVERWRITE query.
yhuai Feb 16, 2014
334aace
New golden files.
yhuai Feb 16, 2014
d00260b
Strips backticks from partition keys.
yhuai Feb 17, 2014
555fb1d
Correctly set the extension for a text file.
yhuai Feb 17, 2014
feb022c
Partitioning key should be case insensitive.
yhuai Feb 17, 2014
a1a4776
Update comments.
yhuai Feb 17, 2014
017872c
Remove stats20 from whitelist.
yhuai Feb 17, 2014
128a9f8
Minor changes.
yhuai Feb 18, 2014
f670c8c
Throw a NotImplementedError for not supported clauses in a CTAS query.
yhuai Feb 18, 2014
c5a4fab
Merge branch 'master' into columnPruning
liancheng Feb 16, 2014
2682f72
Merge remote-tracking branch 'origin/master' into columnPruning
liancheng Feb 18, 2014
54f165b
Fixed spelling typo in two golden answer file names
liancheng Feb 18, 2014
cf4db59
Added golden answers for PruningSuite
liancheng Feb 18, 2014
f22df3a
Merge pull request #37 from yhuai/SerDe
marmbrus Feb 18, 2014
9990ec7
Merge pull request #28 from liancheng/columnPruning
marmbrus Feb 18, 2014
29effad
Include alias in attributes that are produced by overridden tables.
marmbrus Feb 24, 2014
c9116a6
Add combiner to avoid NPE when spark performs external aggregation.
marmbrus Feb 24, 2014
8c01c24
Move definition of Row out of execution to top level sql package.
marmbrus Feb 24, 2014
4905b2b
Add more efficient TopK that avoids global sort for logical Sort => S…
marmbrus Feb 24, 2014
532dd37
Allow the local warehouse path to be specified.
marmbrus Feb 24, 2014
a430895
Planning for logical Repartition operators.
marmbrus Feb 24, 2014
5fe7de4
Move table creation out of rule into a separate function.
marmbrus Feb 24, 2014
b922511
Fix insertion of nested types into hive tables.
marmbrus Feb 24, 2014
18a861b
Correctly convert nested products into nested rows when turning scala…
marmbrus Feb 24, 2014
df88f01
add a simple test for aggregation
marmbrus Feb 24, 2014
6e04e5b
Add insertIntoTable to the DSL.
marmbrus Feb 24, 2014
24eaa79
fix > 100 chars
marmbrus Feb 24, 2014
d393d2a
Review Comments: Add comment to map that adds a sub query.
marmbrus Feb 24, 2014
2225431
Merge pull request #48 from marmbrus/minorFixes
marmbrus Feb 24, 2014
3ac9416
Merge support for working with schema-ed RDDs using catalyst in as a …
marmbrus Feb 25, 2014
f5e7492
Add Apache license. Make naming more consistent.
marmbrus Feb 25, 2014
5f2963c
naming and continuous compilation fixes.
marmbrus Feb 27, 2014
4d57d0e
Fix test execution on travis.
marmbrus Feb 27, 2014
7413ac2
make test downloading quieter.
marmbrus Feb 28, 2014
608a29e
Add hive as a repl dependency
marmbrus Feb 28, 2014
c334386
Initial support for generating schema's based on case classes.
marmbrus Feb 24, 2014
b33e47e
First commit of Parquet import of primitive column types
AndreSchumacher Feb 16, 2014
99a9209
Expanding ParquetQueryTests to cover all primitive types
AndreSchumacher Feb 16, 2014
eb0e521
Fixing package names and other problems that came up after the rebase
AndreSchumacher Feb 17, 2014
6ad05b3
Moving ParquetRelation to spark.sql core
AndreSchumacher Feb 19, 2014
a11e364
Adding Parquet RowWriteSupport
AndreSchumacher Feb 19, 2014
0f17d7b
Rewriting ParquetRelation tests with RowWriteSupport
AndreSchumacher Feb 19, 2014
6a6bf98
Added column projections to ParquetTableScan
AndreSchumacher Feb 19, 2014
f347273
Adding ParquetMetaData extraction, fixing schema projection
AndreSchumacher Feb 20, 2014
75262ee
Integrating operations on Parquet files into SharkStrategies
AndreSchumacher Feb 24, 2014
18fdc44
Reworking Parquet metadata in relation and adding CREATE TABLE AS for…
AndreSchumacher Feb 26, 2014
3a0a552
Reorganizing Parquet table operations
AndreSchumacher Feb 26, 2014
3321195
Fixing one import in ParquetQueryTests.scala
AndreSchumacher Feb 27, 2014
61e3bfb
Adding WriteToFile operator and rewriting ParquetQuerySuite
AndreSchumacher Mar 2, 2014
c863bed
Codestyle checks
AndreSchumacher Mar 2, 2014
3ac9eb0
Rebasing to new main branch
AndreSchumacher Mar 2, 2014
3bda72d
Adding license banner to new files
AndreSchumacher Mar 2, 2014
d7fbc3a
Several performance enhancements and simplifications of the expressio…
marmbrus Feb 27, 2014
296fe50
Address review feedback.
marmbrus Feb 27, 2014
6fdefe6
Port sbt improvements from master.
marmbrus Mar 3, 2014
da9afbd
Add byte wrappers for hive UDFS.
marmbrus Mar 3, 2014
7b9d142
Update travis to increase permgen size.
marmbrus Mar 3, 2014
99e61fb
Merge pull request #51 from marmbrus/expressionEval
marmbrus Mar 3, 2014
8d5da5e
modify compute-classpath.sh to include datanucleus jars explicitly
marmbrus Feb 27, 2014
6d315bb
Added Row.unapplySeq to extract fields from a Row object.
liancheng Mar 5, 2014
70e489d
Fixed a spelling typo
liancheng Mar 5, 2014
1ce01c7
Merge pull request #56 from liancheng/unapplySeqForRow
marmbrus Mar 5, 2014
0040ae6
Feedback from code review
AndreSchumacher Mar 5, 2014
9d419a6
Merge remote-tracking branch 'catalyst/catalystIntegration' into parq…
marmbrus Mar 5, 2014
7d0f13e
Update parquet support with master.
marmbrus Mar 5, 2014
3c3f962
Fix a bug due to array reuse. This will need to be revisited after w…
marmbrus Mar 5, 2014
c9f8fb3
Merge pull request #53 from AndreSchumacher/parquet_support
marmbrus Mar 6, 2014
d371393
Add a framework for dealing with mutable rows to reduce the number of…
marmbrus Mar 5, 2014
959bdf0
Don't silently swallow all KryoExceptions, only the one that indicate…
marmbrus Mar 6, 2014
9049cf0
Extend MutablePair interface to support easy syntax for in-place upda…
marmbrus Mar 6, 2014
d994333
Remove copies before shuffle, this required changing the default shuf…
marmbrus Mar 6, 2014
ba28849
code review comments.
marmbrus Mar 6, 2014
c2a658d
Merge pull request #55 from marmbrus/mutableRows
marmbrus Mar 6, 2014
54637ec
First part of second round of code review feedback
AndreSchumacher Mar 9, 2014
5bacdc0
Moving towards mutable rows inside ParquetRowSupport
AndreSchumacher Mar 9, 2014
7ca4b4e
Improving checks in Parquet tests
AndreSchumacher Mar 11, 2014
aeaef54
Removing unnecessary Row copying and reverting some changes to Mutabl…
AndreSchumacher Mar 11, 2014
7386a9f
Initial example programs using spark sql.
marmbrus Mar 11, 2014
f0ba39e
Merge remote-tracking branch 'origin/master' into maven
marmbrus Mar 11, 2014
7233a74
initial support for maven builds
marmbrus Mar 11, 2014
3447c3e
Don't override the metastore / warehouse in non-local/test hive context.
marmbrus Mar 13, 2014
3386e4f
Merge pull request #58 from AndreSchumacher/parquet_fixes
marmbrus Mar 13, 2014
1a4bbd9
Merge pull request #60 from marmbrus/maven
marmbrus Mar 13, 2014
f93aa39
Better handling of path names in ParquetRelation
AndreSchumacher Mar 14, 2014
5d71074
Merge pull request #62 from AndreSchumacher/parquet_file_fixes
marmbrus Mar 14, 2014
8b35e0a
address feedback, work on DSL
marmbrus Mar 13, 2014
d2d9678
Make sure hive isn't in the assembly jar. Create a separate, optiona…
marmbrus Mar 14, 2014
9eb0294
Bring expressions implicits into SqlContext.
marmbrus Mar 14, 2014
f7d992d
Naming / spelling.
marmbrus Mar 14, 2014
ce8073b
clean up implicits.
marmbrus Mar 14, 2014
2f22454
WIP: Parquet example.
marmbrus Mar 14, 2014
c01470f
Clean up example
marmbrus Mar 14, 2014
013f62a
Fix documentation / code style.
marmbrus Mar 14, 2014
c2efad6
First draft of SQL documentation.
marmbrus Mar 14, 2014
e5e1d6b
Remove travis configuration.
marmbrus Mar 14, 2014
1d0eb63
update changes with spark core
marmbrus Mar 14, 2014
6978dd8
update docs, add apache license
marmbrus Mar 14, 2014
9dffbfa
Style fixes. Add downloading of test cases to jenkins.
marmbrus Mar 14, 2014
adcf1a4
Update sql-programming-guide.md
hcook Mar 14, 2014
461581c
Blacklist test that depends on JVM specific rounding behaviour
marmbrus Mar 15, 2014
48a99bc
Address first round of feedback.
marmbrus Mar 17, 2014
fee847b
Merge remote-tracking branch 'origin/master' into catalyst, integrati…
marmbrus Mar 17, 2014
fead0b6
Create a new RDD type, SchemaRDD, that is now the return type for all…
marmbrus Mar 19, 2014
778299a
Fix some old links to spark-project.org
marmbrus Mar 19, 2014
bdab185
Address another round of comments:
marmbrus Mar 20, 2014
0d638c3
Typo fix from @ash211.
marmbrus Mar 20, 2014
458bd1b
Update people.txt
marmbrus Mar 20, 2014
Files changed
5 changes: 5 additions & 0 deletions assembly/pom.xml
@@ -79,6 +79,11 @@
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.py4j</groupId>
<artifactId>py4j</artifactId>
31 changes: 27 additions & 4 deletions bin/compute-classpath.sh
@@ -33,23 +33,43 @@ fi
# Build up classpath
CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"

# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly".
# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
# the future.
if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
echo "Hive assembly found, including hive support. If this isn't desired run sbt hive/clean."

# Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS

ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
else
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
fi

# First check if we have a dependencies jar. If so, include binary classes with the deps jar
if [ -f "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar ]; then
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"

DEPS_ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar`
DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
else
# Else use spark-assembly jar from either RELEASE or assembly directory
if [ -f "$FWDIR/RELEASE" ]; then
ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark-assembly*.jar`
ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark*-assembly*.jar`
else
ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar`
ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*.jar`
fi
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
fi
@@ -62,6 +82,9 @@ if [[ $SPARK_TESTING == 1 ]]; then
CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/test-classes"
fi

# Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail !
4 changes: 4 additions & 0 deletions dev/download-hive-tests.sh
@@ -0,0 +1,4 @@
#!/bin/sh

wget -O hiveTests.tgz http://cs.berkeley.edu/~marmbrus/tmp/hiveTests.tgz
tar zxf hiveTests.tgz
3 changes: 3 additions & 0 deletions dev/run-tests
@@ -21,6 +21,9 @@
FWDIR="$(cd `dirname $0`/..; pwd)"
cd $FWDIR

# Download Hive Compatibility Files
dev/download-hive-tests.sh

# Remove work directory
rm -rf ./work

9 changes: 9 additions & 0 deletions docs/_layouts/global.html
@@ -66,6 +66,7 @@
<li><a href="python-programming-guide.html">Spark in Python</a></li>
<li class="divider"></li>
<li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
<li><a href="sql-programming-guide.html">Spark SQL</a></li>
<li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
<li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
<li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
@@ -79,6 +80,14 @@
<li><a href="api/pyspark/index.html">Spark Core for Python</a></li>
<li class="divider"></li>
<li><a href="api/streaming/index.html#org.apache.spark.streaming.package">Spark Streaming</a></li>
<li class="dropdown-submenu">
<a tabindex="-1" href="#">Spark SQL</a>
<ul class="dropdown-menu">
<li><a href="api/sql/core/org/apache/spark/sql/SQLContext.html">Spark SQL Core</a></li>
<li><a href="api/sql/hive/org/apache/spark/sql/hive/package.html">Hive Support</a></li>
<li><a href="api/sql/catalyst/org/apache/spark/sql/catalyst/package.html">Catalyst (Optimization)</a></li>
</ul>
</li>
<li><a href="api/mllib/index.html#org.apache.spark.mllib.package">MLlib (Machine Learning)</a></li>
<li><a href="api/bagel/index.html#org.apache.spark.bagel.package">Bagel (Pregel on Spark)</a></li>
<li><a href="api/graphx/index.html#org.apache.spark.graphx.package">GraphX (Graph Processing)</a></li>
13 changes: 13 additions & 0 deletions docs/_plugins/copy_api_dirs.rb
@@ -22,6 +22,7 @@
# Build Scaladoc for Java/Scala
core_projects = ["core", "examples", "repl", "bagel", "graphx", "streaming", "mllib"]
external_projects = ["flume", "kafka", "mqtt", "twitter", "zeromq"]
sql_projects = ["catalyst", "core", "hive"]

projects = core_projects + external_projects.map { |project_name| "external/" + project_name }

@@ -49,6 +50,18 @@
cp_r(source + "/.", dest)
end

sql_projects.each do |project_name|
source = "../sql/" + project_name + "/target/scala-2.10/api/"
dest = "api/sql/" + project_name

puts "echo making directory " + dest
mkdir_p dest

# From the rubydoc: cp_r('src', 'dest') makes src/dest, but this doesn't.
puts "cp -r " + source + "/. " + dest
cp_r(source + "/.", dest)
end

# Build Epydoc for Python
puts "Moving to python directory and building epydoc."
cd("../python")
1 change: 1 addition & 0 deletions docs/index.md
@@ -76,6 +76,7 @@ For this version of Spark (0.8.1) Hadoop 2.2.x (or newer) users will have to bui
* [Java Programming Guide](java-programming-guide.html): using Spark from Java
* [Python Programming Guide](python-programming-guide.html): using Spark from Python
* [Spark Streaming](streaming-programming-guide.html): Spark's API for processing data streams
* [Spark SQL](sql-programming-guide.html): Support for running relational queries on Spark
* [MLlib (Machine Learning)](mllib-guide.html): Spark's built-in machine learning library
* [Bagel (Pregel on Spark)](bagel-programming-guide.html): simple graph processing model
* [GraphX (Graphs on Spark)](graphx-programming-guide.html): Spark's new API for graphs
143 changes: 143 additions & 0 deletions docs/sql-programming-guide.md
@@ -0,0 +1,143 @@
---
layout: global
title: Spark SQL Programming Guide
---
**Spark SQL is currently an Alpha component. Therefore, the APIs may be changed in future releases.**

* This will become a table of contents (this text will be scraped).
{:toc}

# Overview
Spark SQL allows relational queries expressed in SQL, HiveQL, or Scala to be executed using
Spark. At the core of this component is a new type of RDD,
[SchemaRDD](api/sql/core/index.html#org.apache.spark.sql.SchemaRDD). SchemaRDDs are composed of
[Row](api/sql/catalyst/index.html#org.apache.spark.sql.catalyst.expressions.Row) objects along with
a schema that describes the data types of each column in the row. A SchemaRDD is similar to a table
in a traditional relational database. A SchemaRDD can be created from an existing RDD, a Parquet
file, or by running HiveQL against data stored in [Apache Hive](http://hive.apache.org/).

**All of the examples on this page use sample data included in the Spark distribution and can be run in the spark-shell.**

***************************************************************************************************

# Getting Started

The entry point into all relational functionality in Spark is the
[SQLContext](api/sql/core/index.html#org.apache.spark.sql.SQLContext) class, or one of its
descendants. To create a basic SQLContext, all you need is a SparkContext.

{% highlight scala %}
val sc: SparkContext // An existing SparkContext.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

// Importing the SQL context gives access to all the public SQL functions and implicit conversions.
import sqlContext._
{% endhighlight %}

## Running SQL on RDDs
One type of table that is supported by Spark SQL is an RDD of Scala case classes. The case class
defines the schema of the table. The names of the arguments to the case class are read using
reflection and become the names of the columns. Case classes can also be nested or contain complex
types such as Sequences or Arrays. This RDD can be implicitly converted to a SchemaRDD and then be
registered as a table. Tables can be used in subsequent SQL statements.

{% highlight scala %}
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext._

// Define the schema using a case class.
case class Person(name: String, age: Int)

// Create an RDD of Person objects and register it as a table.
val people = sc.textFile("examples/src/main/resources/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt))
people.registerAsTable("people")

// SQL statements can be run by using the sql methods provided by sqlContext.
val teenagers = sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

// The results of SQL queries are SchemaRDDs and support all the normal RDD operations.
// The columns of a row in the result can be accessed by ordinal.
teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
{% endhighlight %}

**Note that Spark SQL currently uses a very basic SQL parser, and the keywords are case sensitive.**
Users who want a more complete dialect of SQL should look at the HiveQL support provided by
`HiveContext`.
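
As a quick illustration of the case-sensitivity note above (a hypothetical sketch reusing the
`people` table registered earlier):

{% highlight scala %}
// With the basic SQL parser, keywords must be written in upper case.
val upperCase = sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")  // parses

// The same query written with lower-case keywords (shown commented out) would
// be rejected by this parser.
// val lowerCase = sql("select name from people where age >= 13 and age <= 19")
{% endhighlight %}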

## Using Parquet

Parquet is a columnar format that is supported by many other data processing systems. Spark SQL
provides support for both reading and writing Parquet files, automatically preserving the schema
of the original data. Using the data from the above example:

{% highlight scala %}
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext._

val people: RDD[Person] // An RDD of case class objects, from the previous example.

// The RDD is implicitly converted to a SchemaRDD, allowing it to be stored using parquet.
people.saveAsParquetFile("people.parquet")

// Read in the parquet file created above. Parquet files are self-describing so the schema is preserved.
// The result of loading a parquet file is also a SchemaRDD.
val parquetFile = sqlContext.parquetFile("people.parquet")

// Parquet files can also be registered as tables and then used in SQL statements.
parquetFile.registerAsTable("parquetFile")
val teenagers = sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
teenagers.collect().foreach(println)
{% endhighlight %}

## Writing Language-Integrated Relational Queries

Spark SQL also supports a domain specific language for writing queries. Once again,
using the data from the above examples:

{% highlight scala %}
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext._
val people: RDD[Person] // An RDD of case class objects, from the first example.

// The following is the same as 'SELECT name FROM people WHERE age >= 10 AND age <= 19'
val teenagers = people.where('age >= 10).where('age <= 19).select('name)
{% endhighlight %}

The DSL uses Scala symbols to represent columns in the underlying table, which are identifiers
prefixed with a tick (`'`). Implicit conversions turn these symbols into expressions that are
evaluated by the SQL execution engine. A full list of the functions supported can be found in the
[ScalaDoc](api/sql/core/index.html#org.apache.spark.sql.SchemaRDD).
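
Because the result of a language-integrated query is itself a SchemaRDD (and therefore an ordinary
RDD), it composes with the usual RDD operations. A minimal sketch, reusing `people` from the first
example:

{% highlight scala %}
// The DSL query returns a SchemaRDD, so standard RDD operations such as map apply.
val teenagerNames = people.where('age >= 10).where('age <= 19).select('name)
teenagerNames.map(row => "Name: " + row(0)).collect().foreach(println)
{% endhighlight %}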

<!-- TODO: Include the table of operations here. -->

# Hive Support

Spark SQL also supports reading and writing data stored in [Apache Hive](http://hive.apache.org/).
However, since Hive has a large number of dependencies, it is not included in the default Spark assembly.
In order to use Hive you must first run '`sbt/sbt hive/assembly`'. This command builds a new assembly
jar that includes Hive. When this jar is present, Spark will use the Hive
assembly instead of the normal Spark assembly. Note that this Hive assembly jar must also be present
on all of the worker nodes, as they will need access to the Hive serialization and deserialization libraries
(SerDes) in order to access data stored in Hive.

Configuration of Hive is done by placing your `hive-site.xml` file in `conf/`.

When working with Hive one must construct a `HiveContext`, which inherits from `SQLContext` and
adds support for finding tables in the MetaStore and writing queries using HiveQL. Users who do
not have an existing Hive deployment can also experiment with the `LocalHiveContext`,
which is similar to `HiveContext`, but creates a local copy of the `metastore` and `warehouse`
automatically.

{% highlight scala %}
val sc: SparkContext // An existing SparkContext.
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)

// Importing the SQL context gives access to all the public SQL functions and implicit conversions.
import hiveContext._

sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")

// Queries are expressed in HiveQL
sql("SELECT key, value FROM src").collect().foreach(println)
{% endhighlight %}
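
Users who do not have an existing Hive deployment can use the `LocalHiveContext` mentioned above
instead. A minimal sketch, assuming its constructor mirrors `HiveContext`'s and takes a
SparkContext:

{% highlight scala %}
// LocalHiveContext creates the metastore and warehouse locally, so no Hive
// installation is required. (Constructor signature assumed to match HiveContext.)
val localHiveContext = new org.apache.spark.sql.hive.LocalHiveContext(sc)

// As with HiveContext, importing the context brings the SQL functions and implicits into scope.
import localHiveContext._
{% endhighlight %}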
6 changes: 6 additions & 0 deletions examples/pom.xml
@@ -70,6 +70,12 @@
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_${scala.binary.version}</artifactId>