From 575d2df350e32964fffb02536cb0587a92ef20e9 Mon Sep 17 00:00:00 2001 From: Leolh Date: Fri, 6 Feb 2015 09:01:53 +0000 Subject: [PATCH 001/357] [SPARK-5380][GraphX] Solve an ArrayIndexOutOfBoundsException when build graph with a file format error When I build a graph with a file format error, there will be an ArrayIndexOutOfBoundsException Author: Leolh Closes #4176 from Leolh/patch-1 and squashes the following commits: 94f6d22 [Leolh] Update GraphLoader.scala 23767f1 [Leolh] [SPARK-3650][GraphX] There will be an ArrayIndexOutOfBoundsException if the format of the source file is wrong --- graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala index 4933aecba1286..21187be7678a6 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala @@ -77,7 +77,7 @@ object GraphLoader extends Logging { if (!line.isEmpty && line(0) != '#') { val lineArray = line.split("\\s+") if (lineArray.length < 2) { - logWarning("Invalid line: " + line) + throw new IllegalArgumentException("Invalid line: " + line) } val srcId = lineArray(0).toLong val dstId = lineArray(1).toLong From f6ba813af21f8aa277155b789a7330e356ce15da Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Fri, 6 Feb 2015 09:33:36 +0000 Subject: [PATCH 002/357] [Minor] Remove permission for execution from spark-shell.cmd .cmd files in bin is not set permission for execution except for spark-shell.cmd. Let's unify that. Author: Kousuke Saruta Closes #3983 from sarutak/fix-mode-of-cmd and squashes the following commits: 9d6eedc [Kousuke Saruta] Removed permission for execution from spark-shell.cmd --- bin/spark-shell.cmd | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 bin/spark-shell.cmd diff --git a/bin/spark-shell.cmd b/bin/spark-shell.cmd old mode 100755 new mode 100644 From 24dbc50b934bfcd9fc3fd91cfd9966b75b250ad7 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Fri, 6 Feb 2015 09:39:12 +0000 Subject: [PATCH 003/357] [SPARK-5157][YARN] Configure more JVM options properly when we use ConcMarkSweepGC for AM. When we set `SPARK_USE_CONC_INCR_GC`, ConcurrentMarkSweepGC works on the AM. Actually, if ConcurrentMarkSweepGC is set for the JVM, following JVM options are set automatically and implicitly. * MaxTenuringThreshold=0 * SurvivorRatio=1024 Those can not be proper value for most cases. 
See also http://www.oracle.com/technetwork/java/tuning-139912.html Author: Kousuke Saruta Closes #3956 from sarutak/SPARK-5157 and squashes the following commits: c15da4e [Kousuke Saruta] Set more JVM options for AM when enabling CMS --- yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 91e8574e94e2f..e7005094b5f3c 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -418,6 +418,8 @@ private[spark] class Client( // In our expts, using (default) throughput collector has severe perf ramifications in // multi-tenant machines javaOpts += "-XX:+UseConcMarkSweepGC" + javaOpts += "-XX:MaxTenuringThreshold=31" + javaOpts += "-XX:SurvivorRatio=8" javaOpts += "-XX:+CMSIncrementalMode" javaOpts += "-XX:+CMSIncrementalPacing" javaOpts += "-XX:CMSIncrementalDutyCycleMin=0" From 856928979f7d00fbb518fc1102a2e7d80cea0f7c Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 6 Feb 2015 10:07:20 +0000 Subject: [PATCH 004/357] [SPARK-5582] [history] Ignore empty log directories. Empty log directories are not useful at the moment, but if one ends up showing in the log root, it breaks the code that checks for log directories. Author: Marcelo Vanzin Closes #4352 from vanzin/SPARK-5582 and squashes the following commits: 1a6a3d4 [Marcelo Vanzin] [SPARK-5582] Fix exception when looking at empty directories. --- .../deploy/history/FsHistoryProvider.scala | 22 ++++++++++++------- .../history/FsHistoryProviderSuite.scala | 18 +++++++++++++++ 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 0ae45f4ad9130..92125f2df7d10 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -173,9 +173,10 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis val logInfos = statusList .filter { entry => try { - val modTime = getModificationTime(entry) - newLastModifiedTime = math.max(newLastModifiedTime, modTime) - modTime >= lastModifiedTime + getModificationTime(entry).map { time => + newLastModifiedTime = math.max(newLastModifiedTime, time) + time >= lastModifiedTime + }.getOrElse(false) } catch { case e: AccessControlException => // Do not use "logInfo" since these messages can get pretty noisy if printed on @@ -251,7 +252,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis appListener.appName.getOrElse(NOT_STARTED), appListener.startTime.getOrElse(-1L), appListener.endTime.getOrElse(-1L), - getModificationTime(eventLog), + getModificationTime(eventLog).get, appListener.sparkUser.getOrElse(NOT_STARTED), isApplicationCompleted(eventLog)) } finally { @@ -310,11 +311,16 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis */ private def isLegacyLogDirectory(entry: FileStatus): Boolean = entry.isDir() - private def getModificationTime(fsEntry: FileStatus): Long = { - if (fsEntry.isDir) { - fs.listStatus(fsEntry.getPath).map(_.getModificationTime()).max + /** + * Returns the modification time of the given event log. 
If the status points at an empty + * directory, `None` is returned, indicating that there isn't an event log at that location. + */ + private def getModificationTime(fsEntry: FileStatus): Option[Long] = { + if (isLegacyLogDirectory(fsEntry)) { + val statusList = fs.listStatus(fsEntry.getPath) + if (!statusList.isEmpty) Some(statusList.map(_.getModificationTime()).max) else None } else { - fsEntry.getModificationTime() + Some(fsEntry.getModificationTime()) } } diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 3fbc1a21d10ed..1d95432258111 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -190,6 +190,24 @@ class FsHistoryProviderSuite extends FunSuite with BeforeAndAfter with Matchers appListAfterRename.head.logPath should not endWith(EventLoggingListener.IN_PROGRESS) } + test("SPARK-5582: empty log directory") { + val conf = new SparkConf() + .set("spark.history.fs.logDirectory", testDir.getAbsolutePath()) + val provider = new FsHistoryProvider(conf) + + val logFile1 = new File(testDir, "app1" + EventLoggingListener.IN_PROGRESS) + writeFile(logFile1, true, None, + SparkListenerApplicationStart("app1", Some("app1"), 1L, "test"), + SparkListenerApplicationEnd(2L)) + + val oldLog = new File(testDir, "old1") + oldLog.mkdir() + + provider.checkForLogs() + val appListAfterRename = provider.getListing() + appListAfterRename.size should be (1) + } + private def writeFile(file: File, isNewFormat: Boolean, codec: Option[CompressionCodec], events: SparkListenerEvent*) = { val out = From ed3aac791232560d2cfe25e3304e48873d48b338 Mon Sep 17 00:00:00 2001 From: lianhuiwang Date: Fri, 6 Feb 2015 11:00:35 +0000 Subject: [PATCH 005/357] [SPARK-5470][Core]use defaultClassLoader to load classes in KryoSerializer Now KryoSerializer load classes of classesToRegister at the time of its initialization. when we set spark.kryo.classesToRegister=class1, it will throw SparkException("Failed to load class to register with Kryo". because in KryoSerializer's initialization, classLoader cannot include class of user's jars. we need to use defaultClassLoader of Serializer in newKryo(), because executor will reset defaultClassLoader of Serializer after Serializer's initialization. thank zzcclp for reporting it to me. 
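For illustration, a minimal sketch (not part of this patch) of the configuration that used to fail; the class name com.example.MyRecord is hypothetical and stands for any class that exists only in a user jar shipped with the application, e.g. via --jars on spark-submit:

    import org.apache.spark.SparkConf

    val conf = new SparkConf()
      .setAppName("kryo-user-jar-example")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // hypothetical user-jar class; previously it was resolved eagerly with
      // Class.forName during KryoSerializer's own initialization, before the
      // executor had reset defaultClassLoader to one that can see user jars,
      // so this threw SparkException("Failed to load class to register with Kryo")
      .set("spark.kryo.classesToRegister", "com.example.MyRecord")
    // With this change the class is looked up lazily in newKryo() via the
    // serializer's class loader, so registration succeeds once the executor
    // class loader is in place.
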
Author: lianhuiwang Closes #4258 from lianhuiwang/SPARK-5470 and squashes the following commits: 73b719f [lianhuiwang] do the splitting and filtering during initialization 64cf306 [lianhuiwang] use defaultClassLoader to load classes of classesToRegister in KryoSerializer --- .../org/apache/spark/serializer/KryoSerializer.scala | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index d56e23ce4478a..02158aa0f866e 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -58,14 +58,6 @@ class KryoSerializer(conf: SparkConf) private val classesToRegister = conf.get("spark.kryo.classesToRegister", "") .split(',') .filter(!_.isEmpty) - .map { className => - try { - Class.forName(className) - } catch { - case e: Exception => - throw new SparkException("Failed to load class to register with Kryo", e) - } - } def newKryoOutput() = new KryoOutput(bufferSize, math.max(bufferSize, maxBufferSize)) @@ -97,7 +89,8 @@ class KryoSerializer(conf: SparkConf) // Use the default classloader when calling the user registrator. Thread.currentThread.setContextClassLoader(classLoader) // Register classes given through spark.kryo.classesToRegister. - classesToRegister.foreach { clazz => kryo.register(clazz) } + classesToRegister + .foreach { className => kryo.register(Class.forName(className, true, classLoader)) } // Allow the user to register their own classes by setting spark.kryo.registrator. userRegistrator .map(Class.forName(_, true, classLoader).newInstance().asInstanceOf[KryoRegistrator]) From f827ef4d7ead85aab65837ea625141e05e917c2e Mon Sep 17 00:00:00 2001 From: Miguel Peralvo Date: Fri, 6 Feb 2015 11:04:48 +0000 Subject: [PATCH 006/357] Update ec2-scripts.md Change spark-version from 1.1.0 to 1.2.0 in the example for spark-ec2/Launch Cluster. Author: Miguel Peralvo Closes #4300 from MiguelPeralvo/patch-1 and squashes the following commits: 38adf0b [Miguel Peralvo] Update ec2-scripts.md 1850869 [Miguel Peralvo] Update ec2-scripts.md --- docs/ec2-scripts.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ec2-scripts.md b/docs/ec2-scripts.md index d50f445d7ecc7..8c9a1e1262d8f 100644 --- a/docs/ec2-scripts.md +++ b/docs/ec2-scripts.md @@ -52,7 +52,7 @@ identify machines belonging to each cluster in the Amazon EC2 Console. ```bash export AWS_SECRET_ACCESS_KEY=AaBbCcDdEeFGgHhIiJjKkLlMmNnOoPpQqRrSsTtU export AWS_ACCESS_KEY_ID=ABCDEFG1234567890123 -./spark-ec2 --key-pair=awskey --identity-file=awskey.pem --region=us-west-1 --zone=us-west-1a --spark-version=1.1.0 launch my-spark-cluster +./spark-ec2 --key-pair=awskey --identity-file=awskey.pem --region=us-west-1 --zone=us-west-1a launch my-spark-cluster ``` - After everything launches, check that the cluster scheduler is up and sees From cf6778e8d8e8d7e0b4e7b17637ad812624a937dd Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Fri, 6 Feb 2015 11:38:39 +0000 Subject: [PATCH 007/357] [Build] Set all Debian package permissions to 755 755 means the owner can read, write, and execute, and everyone else can just read and execute. I think that's what we want here since without execute permissions others cannot open directories. Inspired by [this comment on a separate PR](https://github.com/apache/spark/pull/3297#issuecomment-63286730). 
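As a hedged aside (not part of the patch), the same mode expressed with the standard java.nio API, for readers who prefer symbolic notation:

    import java.nio.file.attribute.PosixFilePermissions

    // 755 == rwxr-xr-x: 7 = read/write/execute for the owner, 5 = read/execute
    // for group and others, so directories stay traversable by everyone but
    // writable only by the owner.
    val mode755 = PosixFilePermissions.fromString("rwxr-xr-x")
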
Author: Nicholas Chammas Closes #4277 from nchammas/patch-1 and squashes the following commits: da77fb0 [Nicholas Chammas] [Build] Set all Debian package permissions to 755 --- assembly/pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 1bb5a671f5390..301ff69c2ae3b 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -39,7 +39,7 @@ spark /usr/share/spark root - 744 + 755 @@ -280,7 +280,7 @@ ${deb.user} ${deb.user} ${deb.install.path}/conf - 744 + ${deb.bin.filemode} @@ -302,7 +302,7 @@ ${deb.user} ${deb.user} ${deb.install.path}/sbin - 744 + ${deb.bin.filemode} @@ -313,7 +313,7 @@ ${deb.user} ${deb.user} ${deb.install.path}/python - 744 + ${deb.bin.filemode} From 37d35ab53b82916e4bf0eeb788044bfc33f329df Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Fri, 6 Feb 2015 12:22:25 +0000 Subject: [PATCH 008/357] [SPARK-5416] init Executor.threadPool before ExecutorSource Some ExecutorSource metrics can NPE by attempting to reference the threadpool otherwise. Author: Ryan Williams Closes #4212 from ryan-williams/threadpool and squashes the following commits: 236f2ad [Ryan Williams] init Executor.threadPool before ExecutorSource --- .../src/main/scala/org/apache/spark/executor/Executor.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 312bb3a1daaa3..5141483d1e745 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -75,6 +75,9 @@ private[spark] class Executor( Thread.setDefaultUncaughtExceptionHandler(SparkUncaughtExceptionHandler) } + // Start worker thread pool + val threadPool = Utils.newDaemonCachedThreadPool("Executor task launch worker") + val executorSource = new ExecutorSource(this, executorId) if (!isLocal) { @@ -101,9 +104,6 @@ private[spark] class Executor( // Limit of bytes for total size of results (default is 1GB) private val maxResultSize = Utils.getMaxResultSize(conf) - // Start worker thread pool - val threadPool = Utils.newDaemonCachedThreadPool("Executor task launch worker") - // Maintains the list of running tasks. 
private val runningTasks = new ConcurrentHashMap[Long, TaskRunner] From 9ad56ad2a2a51df449040c4f4b7c66b104883312 Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Fri, 6 Feb 2015 10:26:51 -0800 Subject: [PATCH 009/357] [SPARK-5013] [MLlib] Added documentation and sample data file for GaussianMixture Simple description and code samples (and sample data) for GaussianMixture Author: Travis Galoppo Closes #4401 from tgaloppo/spark-5013 and squashes the following commits: c9ff9a5 [Travis Galoppo] Fixed link in mllib-clustering.md Added Gaussian mixture and power iteration as available clustering techniques in mllib-guide 2368690 [Travis Galoppo] Minor fixes 3eb41fa [Travis Galoppo] [SPARK-5013] Added documentation and sample data file for GaussianMixture --- data/mllib/gmm_data.txt | 2000 ++++++++++++++++++++++++++++++++++++++ docs/mllib-clustering.md | 131 ++- docs/mllib-guide.md | 2 + 3 files changed, 2130 insertions(+), 3 deletions(-) create mode 100644 data/mllib/gmm_data.txt diff --git a/data/mllib/gmm_data.txt b/data/mllib/gmm_data.txt new file mode 100644 index 0000000000000..934ee4a83a2df --- /dev/null +++ b/data/mllib/gmm_data.txt @@ -0,0 +1,2000 @@ + 2.59470454e+00 2.12298217e+00 + 1.15807024e+00 -1.46498723e-01 + 2.46206638e+00 6.19556894e-01 + -5.54845070e-01 -7.24700066e-01 + -3.23111426e+00 -1.42579084e+00 + 3.02978115e+00 7.87121753e-01 + 1.97365907e+00 1.15914704e+00 + -6.44852101e+00 -3.18154314e+00 + 1.30963349e+00 1.62866434e-01 + 4.26482541e+00 2.15547996e+00 + 3.79927257e+00 1.50572445e+00 + 4.17452609e-01 -6.74032760e-01 + 4.21117627e-01 4.45590255e-01 + -2.80425571e+00 -7.77150554e-01 + 2.55928797e+00 7.03954218e-01 + 1.32554059e+00 -9.46663152e-01 + -3.39691439e+00 -1.49005743e+00 + -2.26542270e-01 3.60052515e-02 + 1.04994198e+00 5.29825685e-01 + -1.51566882e+00 -1.86264432e-01 + -3.27928172e-01 -7.60859110e-01 + -3.18054866e-01 3.97719805e-01 + 1.65579418e-01 -3.47232033e-01 + 6.47162333e-01 4.96059961e-02 + -2.80776647e-01 4.79418757e-01 + 7.45069752e-01 1.20790281e-01 + 2.13604102e-01 1.59542555e-01 + -3.08860224e+00 -1.43259870e+00 + 8.97066497e-01 1.10206801e+00 + -2.23918874e-01 -1.07267267e+00 + 2.51525708e+00 2.84761973e-01 + 9.98052532e-01 1.08333783e+00 + 1.76705588e+00 8.18866778e-01 + 5.31555163e-02 -1.90111151e-01 + -2.17405059e+00 7.21854582e-02 + -2.13772505e+00 -3.62010387e-01 + 2.95974057e+00 1.31602381e+00 + 2.74053561e+00 1.61781757e+00 + 6.68135448e-01 2.86586009e-01 + 2.82323739e+00 1.74437257e+00 + 8.11540288e-01 5.50744478e-01 + 4.10050897e-01 5.10668402e-03 + 9.58626136e-01 -3.49633680e-01 + 4.66599798e+00 1.49964894e+00 + 4.94507794e-01 2.58928077e-01 + -2.36029742e+00 -1.61042909e+00 + -4.99306804e-01 -8.04984769e-01 + 1.07448510e+00 9.39605828e-01 + -1.80448949e+00 -1.05983264e+00 + -3.22353821e-01 1.73612093e-01 + 1.85418702e+00 1.15640643e+00 + 6.93794163e-01 6.59993560e-01 + 1.99399102e+00 1.44547123e+00 + 3.38866124e+00 1.23379290e+00 + -4.24067720e+00 -1.22264282e+00 + 6.03230201e-02 2.95232729e-01 + -3.59341813e+00 -7.17453726e-01 + 4.87447372e-01 -2.00733911e-01 + 1.20149195e+00 4.07880197e-01 + -2.13331464e+00 -4.58518077e-01 + -3.84091083e+00 -1.71553950e+00 + -5.37279250e-01 2.64822629e-02 + -2.10155227e+00 -1.32558103e+00 + -1.71318897e+00 -7.12098563e-01 + -1.46280695e+00 -1.84868337e-01 + -3.59785325e+00 -1.54832434e+00 + -5.77528081e-01 -5.78580857e-01 + 3.14734283e-01 5.80184639e-01 + -2.71164714e+00 -1.19379432e+00 + 1.09634489e+00 7.20143887e-01 + -3.05527722e+00 -1.47774064e+00 + 6.71753586e-01 7.61350020e-01 + 
3.98294144e+00 1.54166484e+00 + -3.37220384e+00 -2.21332064e+00 + 1.81222914e+00 7.41212752e-01 + 2.71458282e-01 1.36329078e-01 + -3.97815359e-01 1.16766886e-01 + -1.70192814e+00 -9.75851571e-01 + -3.46803804e+00 -1.09965988e+00 + -1.69649627e+00 -5.76045801e-01 + -1.02485636e-01 -8.81841246e-01 + -3.24194667e-02 2.55429276e-01 + -2.75343168e+00 -1.51366320e+00 + -2.78676702e+00 -5.22360489e-01 + 1.70483164e+00 1.19769805e+00 + 4.92022579e-01 3.24944706e-01 + 2.48768464e+00 1.00055363e+00 + 4.48786400e-01 7.63902870e-01 + 2.93862696e+00 1.73809968e+00 + -3.55019305e+00 -1.97875558e+00 + 1.74270784e+00 6.90229224e-01 + 5.13391994e-01 4.58374016e-01 + 1.78379499e+00 9.08026381e-01 + 1.75814147e+00 7.41449784e-01 + -2.30687792e-01 3.91009729e-01 + 3.92271353e+00 1.44006290e+00 + 2.93361679e-01 -4.99886375e-03 + 2.47902690e-01 -7.49542503e-01 + -3.97675355e-01 1.36824887e-01 + 3.56535953e+00 1.15181329e+00 + 3.22425301e+00 1.28702383e+00 + -2.94192478e-01 -2.42382557e-01 + 8.02068864e-01 -1.51671475e-01 + 8.54133530e-01 -4.89514885e-02 + -1.64316316e-01 -5.34642346e-01 + -6.08485405e-01 -2.10332352e-01 + -2.18940059e+00 -1.07024952e+00 + -1.71586960e+00 -2.83333492e-02 + 1.70200448e-01 -3.28031178e-01 + -1.97210346e+00 -5.39948532e-01 + 2.19500160e+00 1.05697170e+00 + -1.76239935e+00 -1.09377438e+00 + 1.68314744e+00 6.86491164e-01 + -2.99852288e+00 -1.46619067e+00 + -2.23769560e+00 -9.15008355e-01 + 9.46887516e-01 5.58410503e-01 + 5.02153123e-01 1.63851235e-01 + -9.70297062e-01 3.14625374e-01 + -1.29405593e+00 -8.20994131e-01 + 2.72516079e+00 7.85839947e-01 + 1.45788024e+00 3.37487353e-01 + -4.36292749e-01 -5.42150480e-01 + 2.21304711e+00 1.25254042e+00 + -1.20810271e-01 4.79632898e-01 + -3.30884511e+00 -1.50607586e+00 + -6.55882455e+00 -1.94231256e+00 + -3.17033630e+00 -9.94678930e-01 + 1.42043617e+00 7.28808957e-01 + -1.57546099e+00 -1.10320497e+00 + -3.22748754e+00 -1.64174579e+00 + 2.96776017e-03 -3.16191512e-02 + -2.25986054e+00 -6.13123197e-01 + 2.49434243e+00 7.73069183e-01 + 9.08494049e-01 -1.53926853e-01 + -2.80559090e+00 -1.37474221e+00 + 4.75224286e-01 2.53153674e-01 + 4.37644006e+00 8.49116998e-01 + 2.27282959e+00 6.16568202e-01 + 1.16006880e+00 1.65832798e-01 + -1.67163193e+00 -1.22555386e+00 + -1.38231118e+00 -7.29575504e-01 + -3.49922750e+00 -2.26446675e+00 + -3.73780110e-01 -1.90657869e-01 + 1.68627679e+00 1.05662987e+00 + -3.28891792e+00 -1.11080334e+00 + -2.59815798e+00 -1.51410198e+00 + -2.61203309e+00 -6.00143552e-01 + 6.58964943e-01 4.47216094e-01 + -2.26711381e+00 -7.26512923e-01 + -5.31429009e-02 -1.97925341e-02 + 3.19749807e+00 9.20425476e-01 + -1.37595787e+00 -6.58062732e-01 + 8.09900278e-01 -3.84286160e-01 + -5.07741280e+00 -1.97683808e+00 + -2.99764250e+00 -1.50753777e+00 + -9.87671815e-01 -4.63255889e-01 + 1.65390765e+00 6.73806615e-02 + 5.51252659e+00 2.69842267e+00 + -2.23724309e+00 -4.77624004e-01 + 4.99726228e+00 1.74690949e+00 + 1.75859162e-01 -1.49350995e-01 + 4.13382789e+00 1.31735161e+00 + 2.69058117e+00 4.87656923e-01 + 1.07180318e+00 1.01426954e+00 + 3.37216869e+00 1.05955377e+00 + -2.95006781e+00 -1.57048303e+00 + -2.46401648e+00 -8.37056374e-01 + 1.19012962e-01 7.54702770e-01 + 3.34142539e+00 4.81938295e-01 + 2.92643913e+00 1.04301050e+00 + 2.89697751e+00 1.37551442e+00 + -1.03094242e+00 2.20903962e-01 + -5.13914589e+00 -2.23355387e+00 + -8.81680780e-01 1.83590000e-01 + 2.82334775e+00 1.26650464e+00 + -2.81042540e-01 -3.26370240e-01 + 2.97995487e+00 8.34569452e-01 + -1.39857135e+00 -1.15798385e+00 + 4.27186506e+00 9.04253702e-01 + 6.98684517e-01 
7.91167305e-01 + 3.52233095e+00 1.29976473e+00 + 2.21448029e+00 2.73213379e-01 + -3.13505683e-01 -1.20593774e-01 + 3.70571571e+00 1.06220876e+00 + 9.83881041e-01 5.67713803e-01 + -2.17897705e+00 2.52925205e-01 + 1.38734039e+00 4.61287066e-01 + -1.41181602e+00 -1.67248955e-02 + -1.69974639e+00 -7.17812071e-01 + -2.01005793e-01 -7.49662056e-01 + 1.69016336e+00 3.24687979e-01 + -2.03250179e+00 -2.76108460e-01 + 3.68776848e-01 4.12536941e-01 + 7.66238259e-01 -1.84750637e-01 + -2.73989147e-01 -1.72817250e-01 + -2.18623745e+00 -2.10906798e-01 + -1.39795625e-01 3.26066094e-02 + -2.73826912e-01 -6.67586097e-02 + -1.57880654e+00 -4.99395900e-01 + 4.55950908e+00 2.29410489e+00 + -7.36479631e-01 -1.57861857e-01 + 1.92082888e+00 1.05843391e+00 + 4.29192810e+00 1.38127810e+00 + 1.61852879e+00 1.95871986e-01 + -1.95027403e+00 -5.22448168e-01 + -1.67446281e+00 -9.41497162e-01 + 6.07097859e-01 3.44178029e-01 + -3.44004683e+00 -1.49258461e+00 + 2.72114752e+00 6.00728991e-01 + 8.80685522e-01 -2.53243336e-01 + 1.39254928e+00 3.42988512e-01 + 1.14194836e-01 -8.57945694e-02 + -1.49387332e+00 -7.60860481e-01 + -1.98053285e+00 -4.86039865e-01 + 3.56008568e+00 1.08438692e+00 + 2.27833961e-01 1.09441881e+00 + -1.16716710e+00 -6.54778242e-01 + 2.02156613e+00 5.42075758e-01 + 1.08429178e+00 -7.67420693e-01 + 6.63058455e-01 4.61680991e-01 + -1.06201537e+00 1.38862846e-01 + 3.08701875e+00 8.32580273e-01 + -4.96558108e-01 -2.47031257e-01 + 7.95109987e-01 7.59314147e-02 + -3.39903524e-01 8.71565566e-03 + 8.68351357e-01 4.78358641e-01 + 1.48750819e+00 7.63257420e-01 + -4.51224101e-01 -4.44056898e-01 + -3.02734750e-01 -2.98487961e-01 + 5.46846609e-01 7.02377629e-01 + 1.65129778e+00 3.74008231e-01 + -7.43336512e-01 3.95723531e-01 + -5.88446605e-01 -6.47520211e-01 + 3.58613167e+00 1.95024937e+00 + 3.11718883e+00 8.37984715e-01 + 1.80919244e+00 9.62644986e-01 + 5.43856371e-02 -5.86297543e-01 + -1.95186766e+00 -1.02624212e-01 + 8.95628057e-01 5.91812281e-01 + 4.97691627e-02 5.31137156e-01 + -1.07633113e+00 -2.47392788e-01 + -1.17257986e+00 -8.68528265e-01 + -8.19227665e-02 5.80579434e-03 + -2.86409787e-01 1.95812924e-01 + 1.10582671e+00 7.42853240e-01 + 4.06429774e+00 1.06557476e+00 + -3.42521792e+00 -7.74327139e-01 + 1.28468671e+00 6.20431661e-01 + 6.01201008e-01 -1.16799728e-01 + -1.85058727e-01 -3.76235293e-01 + 5.44083324e+00 2.98490868e+00 + 2.69273070e+00 7.83901153e-01 + 1.88938036e-01 -4.83222152e-01 + 1.05667256e+00 -2.57003165e-01 + 2.99711662e-01 -4.33131912e-01 + 7.73689216e-02 -1.78738364e-01 + 9.58326279e-01 6.38325706e-01 + -3.97727049e-01 2.27314759e-01 + 3.36098175e+00 1.12165237e+00 + 1.77804871e+00 6.46961933e-01 + -2.86945546e+00 -1.00395518e+00 + 3.03494815e+00 7.51814612e-01 + -1.43658194e+00 -3.55432244e-01 + -3.08455105e+00 -1.51535106e+00 + -1.55841975e+00 3.93454820e-02 + 7.96073412e-01 -3.11036969e-01 + -9.84125401e-01 -1.02064649e+00 + -7.75688143e+00 -3.65219926e+00 + 1.53816429e+00 7.65926670e-01 + -4.92712738e-01 2.32244240e-02 + -1.93166919e+00 -1.07701304e+00 + 2.03029875e-02 -7.54055699e-01 + 2.52177489e+00 1.01544979e+00 + 3.65109048e-01 -9.48328494e-01 + -1.28849143e-01 2.51947174e-01 + -1.02428075e+00 -9.37767116e-01 + -3.04179748e+00 -9.97926994e-01 + -2.51986980e+00 -1.69117413e+00 + -1.24900838e+00 -4.16179917e-01 + 2.77943992e+00 1.22842327e+00 + -4.37434557e+00 -1.70182693e+00 + -1.60019319e+00 -4.18345639e-01 + -1.67613646e+00 -9.44087262e-01 + -9.00843245e-01 8.26378089e-02 + 3.29770621e-01 -9.07870444e-01 + -2.84650535e+00 -9.00155396e-01 + 1.57111705e+00 7.07432268e-01 + 
1.24948552e+00 1.04812849e-01 + 1.81440558e+00 9.53545082e-01 + -1.74915794e+00 -1.04606288e+00 + 1.20593269e+00 -1.12607147e-02 + 1.36004919e-01 -1.09828044e+00 + 2.57480693e-01 3.34941541e-01 + 7.78775385e-01 -5.32494732e-01 + -1.79155126e+00 -6.29994129e-01 + -1.75706839e+00 -8.35100126e-01 + 4.29512012e-01 7.81426910e-02 + 3.08349370e-01 -1.27359861e-01 + 1.05560329e+00 4.55150640e-01 + 1.95662574e+00 1.17593217e+00 + 8.77376632e-01 6.57866662e-01 + 7.71311255e-01 9.15134334e-02 + -6.36978275e+00 -2.55874241e+00 + -2.98335339e+00 -1.59567024e+00 + -3.67104587e-01 1.85315291e-01 + 1.95347407e+00 -7.15503113e-02 + 8.45556363e-01 6.51256415e-02 + 9.42868521e-01 3.56647624e-01 + 2.99321875e+00 1.07505254e+00 + -2.91030538e-01 -3.77637183e-01 + 1.62870918e+00 3.37563671e-01 + 2.05773173e-01 3.43337416e-01 + -8.40879199e-01 -1.35600767e-01 + 1.38101624e+00 5.99253495e-01 + -6.93715607e+00 -2.63580662e+00 + -1.04423404e+00 -8.32865050e-01 + 1.33448476e+00 1.04863475e+00 + 6.01675207e-01 1.98585194e-01 + 2.31233993e+00 7.98628331e-01 + 1.85201313e-01 -1.76070247e+00 + 1.92006354e+00 8.45737582e-01 + 1.06320415e+00 2.93426068e-01 + -1.20360141e+00 -1.00301288e+00 + 1.95926629e+00 6.26643532e-01 + 6.04483978e-02 5.72643059e-01 + -1.04568563e+00 -5.91021496e-01 + 2.62300678e+00 9.50997831e-01 + -4.04610275e-01 3.73150879e-01 + 2.26371902e+00 8.73627529e-01 + 2.12545313e+00 7.90640352e-01 + 7.72181917e-03 1.65718952e-02 + 1.00422340e-01 -2.05562936e-01 + -1.22989802e+00 -1.01841681e-01 + 3.09064082e+00 1.04288010e+00 + 5.18274167e+00 1.34749259e+00 + -8.32075153e-01 -1.97592029e-01 + 3.84126764e-02 5.58171345e-01 + 4.99560727e-01 -4.26154438e-02 + 4.79071151e+00 2.19728942e+00 + -2.78437968e+00 -1.17812590e+00 + -2.22804226e+00 -4.31174255e-01 + 8.50762292e-01 -1.06445261e-01 + 1.10812830e+00 -2.59118812e-01 + -2.91450155e-01 6.42802679e-01 + -1.38631532e-01 -5.88585623e-01 + -5.04120983e-01 -2.17094915e-01 + 3.41410820e+00 1.67897767e+00 + -2.23697326e+00 -6.62735244e-01 + -3.55961064e-01 -1.27647226e-01 + -3.55568274e+00 -2.49011369e+00 + -8.77586408e-01 -9.38268065e-03 + 1.52382384e-01 -5.62155760e-01 + 1.55885574e-01 1.07617069e-01 + -8.37129973e-01 -5.22259081e-01 + -2.92741750e+00 -1.35049428e+00 + -3.54670781e-01 5.69205952e-02 + 2.21030255e+00 1.34689986e+00 + 1.60787722e+00 5.75984706e-01 + 1.32294221e+00 5.31577509e-01 + 7.05672928e-01 3.34241244e-01 + 1.41406179e+00 1.15783408e+00 + -6.92172228e-01 -2.84817896e-01 + 3.28358655e-01 -2.66910083e-01 + 1.68013644e-01 -4.28016549e-02 + 2.07365974e+00 7.76496211e-01 + -3.92974907e-01 2.46796730e-01 + -5.76078636e-01 3.25676963e-01 + -1.82547204e-01 -5.06410543e-01 + 3.04754906e+00 1.16174496e+00 + -3.01090632e+00 -1.09195183e+00 + -1.44659696e+00 -6.87838682e-01 + 2.11395861e+00 9.10495785e-01 + 1.40962871e+00 1.13568678e+00 + -1.66653234e-01 -2.10012503e-01 + 3.17456029e+00 9.74502922e-01 + 2.15944820e+00 8.62807189e-01 + -3.45418719e+00 -1.33647548e+00 + -3.41357732e+00 -8.47048920e-01 + -3.06702448e-01 -6.64280634e-01 + -2.86930714e-01 -1.35268264e-01 + -3.15835557e+00 -5.43439253e-01 + 2.49541440e-01 -4.71733570e-01 + 2.71933912e+00 4.13308399e-01 + -2.43787038e+00 -1.08050547e+00 + -4.90234490e-01 -6.64069865e-01 + 8.99524451e-02 5.76180541e-01 + 5.00500404e+00 2.12125521e+00 + -1.73107940e-01 -2.28506575e-02 + 5.44938858e-01 -1.29523352e-01 + 5.13526842e+00 1.68785993e+00 + 1.70228304e+00 1.02601138e+00 + 3.58957507e+00 1.54396196e+00 + 1.85615738e+00 4.92916197e-01 + 2.55772147e+00 7.88438908e-01 + -1.57008279e+00 -4.17377300e-01 + 
-1.42548604e+00 -3.63684860e-01 + -8.52026118e-01 2.72052686e-01 + -5.10563077e+00 -2.35665994e+00 + -2.95517031e+00 -1.84945297e+00 + -2.91947959e+00 -1.66016784e+00 + -4.21462387e+00 -1.41131535e+00 + 6.59901121e-01 4.87156314e-01 + -9.75352532e-01 -4.50231285e-01 + -5.94084444e-01 -1.16922670e+00 + 7.50554615e-01 -9.83692552e-01 + 1.07054926e+00 2.77143030e-01 + -3.88079578e-01 -4.17737309e-02 + -9.59373733e-01 -8.85454886e-01 + -7.53560665e-02 -5.16223870e-02 + 9.84108158e-01 -5.89290700e-02 + 1.87272961e-01 -4.34238391e-01 + 6.86509981e-01 -3.15116460e-01 + -1.07762538e+00 6.58984161e-02 + 6.09266592e-01 6.91808473e-02 + -8.30529954e-01 -7.00454791e-01 + -9.13179464e-01 -6.31712891e-01 + 7.68744851e-01 1.09840676e+00 + -1.07606690e+00 -8.78390282e-01 + -1.71038184e+00 -5.73606033e-01 + 8.75982765e-01 3.66343143e-01 + -7.04919009e-01 -8.49182590e-01 + -1.00274668e+00 -7.99573611e-01 + -1.05562848e+00 -5.84060076e-01 + 4.03490015e+00 1.28679206e+00 + -3.53484804e+00 -1.71381255e+00 + 2.31527363e-01 1.04179397e-01 + -3.58592392e-02 3.74895739e-01 + 3.92253428e+00 1.81852726e+00 + -7.27384249e-01 -6.45605128e-01 + 4.65678097e+00 2.41379899e+00 + 1.16750534e+00 7.60718205e-01 + 1.15677059e+00 7.96225550e-01 + -1.42920261e+00 -4.66946295e-01 + 3.71148192e+00 1.88060191e+00 + 2.44052407e+00 3.84472199e-01 + -1.64535035e+00 -8.94530036e-01 + -3.69608753e+00 -1.36402754e+00 + 2.24419208e+00 9.69744889e-01 + 2.54822427e+00 1.22613039e+00 + 3.77484909e-01 -5.98521878e-01 + -3.61521175e+00 -1.11123912e+00 + 3.28113127e+00 1.52551775e+00 + -3.51030902e+00 -1.53913980e+00 + -2.44874505e+00 -6.30246005e-01 + -3.42516153e-01 -5.07352665e-01 + 1.09110502e+00 6.36821628e-01 + -2.49434967e+00 -8.02827146e-01 + 1.41763139e+00 -3.46591820e-01 + 1.61108619e+00 5.93871102e-01 + 3.97371717e+00 1.35552499e+00 + -1.33437177e+00 -2.83908670e-01 + -1.41606483e+00 -1.76402601e-01 + 2.23945322e-01 -1.77157065e-01 + 2.60271569e+00 2.40778251e-01 + -2.82213895e-02 1.98255474e-01 + 4.20727940e+00 1.31490863e+00 + 3.36944889e+00 1.57566635e+00 + 3.53049396e+00 1.73579350e+00 + -1.29170202e+00 -1.64196290e+00 + 9.27295604e-01 9.98808036e-01 + 1.75321843e-01 -2.83267817e-01 + -2.19069578e+00 -1.12814358e+00 + 1.66606031e+00 7.68006933e-01 + -7.13826035e-01 5.20881684e-02 + -3.43821888e+00 -2.36137021e+00 + -5.93210310e-01 1.21843813e-01 + -4.09800822e+00 -1.39893953e+00 + 2.74110954e+00 1.52728606e+00 + 1.72652512e+00 -1.25435113e-01 + 1.97722357e+00 6.40667481e-01 + 4.18635780e-01 3.57018509e-01 + -1.78303569e+00 -2.11864764e-01 + -3.52809366e+00 -2.58794450e-01 + -4.72407090e+00 -1.63870734e+00 + 1.73917807e+00 8.73251829e-01 + 4.37979356e-01 8.49210569e-01 + 3.93791881e+00 1.76269490e+00 + 2.79065411e+00 1.04019042e+00 + -8.47426142e-01 -3.40136892e-01 + -4.24389181e+00 -1.80253120e+00 + -1.86675870e+00 -7.64558265e-01 + 9.46212675e-01 -7.77681445e-02 + -2.82448462e+00 -1.33592449e+00 + -2.57938567e+00 -1.56554690e+00 + -2.71615767e+00 -6.27667233e-01 + -1.55999166e+00 -5.81013466e-01 + -4.24696864e-01 -7.44673250e-01 + 1.67592970e+00 7.68164292e-01 + 8.48455216e-01 -6.05681126e-01 + 6.12575454e+00 1.65607584e+00 + 1.38207327e+00 2.39261863e-01 + 3.13364450e+00 1.17154698e+00 + 1.71694858e+00 1.26744905e+00 + -1.61746367e+00 -8.80098073e-01 + -8.52196756e-01 -9.27299728e-01 + -1.51562462e-01 -8.36552490e-02 + -7.04792753e-01 -1.24726713e-02 + -3.35265757e+00 -1.82176312e+00 + 3.32173170e-01 -1.33405580e-01 + 4.95841013e-01 4.58292712e-01 + 1.57713955e+00 7.79272991e-01 + 2.09743109e+00 9.23542557e-01 + 3.90450311e-03 
-8.42873164e-01 + 2.59519038e+00 7.56479591e-01 + -5.77643976e-01 -2.36401904e-01 + -5.22310654e-01 1.34187830e-01 + -2.22096086e+00 -7.75507719e-01 + 1.35907831e+00 7.80197510e-01 + 3.80355868e+00 1.16983476e+00 + 3.82746596e+00 1.31417718e+00 + 3.30451183e+00 1.55398159e+00 + -3.42917814e-01 -8.62281222e-02 + -2.59093020e+00 -9.29883526e-01 + 1.40928562e+00 1.08398346e+00 + 1.54400137e-01 3.35881092e-01 + 1.59171586e+00 1.18855802e+00 + -5.25164002e-01 -1.03104220e-01 + 2.20067959e+00 1.37074713e+00 + 6.97860830e-01 6.27718548e-01 + -4.59743507e-01 1.36061163e-01 + -1.04691963e-01 -2.16271727e-01 + -1.08905573e+00 -5.95510769e-01 + -1.00826983e+00 -5.38509162e-02 + -3.16402719e+00 -1.33414216e+00 + 1.47870874e-01 1.75234619e-01 + -2.57078234e-01 7.03316889e-02 + 1.81073945e+00 4.26901462e-01 + 2.65476530e+00 6.74217273e-01 + 1.27539811e+00 6.22914081e-01 + -3.76750499e-01 -1.20629449e+00 + 1.00177595e+00 -1.40660091e-01 + -2.98919265e+00 -1.65145013e+00 + -2.21557682e+00 -8.11123452e-01 + -3.22635378e+00 -1.65639056e+00 + -2.72868553e+00 -1.02812087e+00 + 1.26042797e+00 8.49005248e-01 + -9.38318534e-01 -9.87588651e-01 + 3.38013194e-01 -1.00237461e-01 + 1.91175691e+00 8.48716369e-01 + 4.30244344e-01 6.05539915e-02 + 2.21783435e+00 3.03268204e-01 + 1.78019576e+00 1.27377108e+00 + 1.59733274e+00 4.40674687e-02 + 3.97428484e+00 2.20881566e+00 + -2.41108677e+00 -6.01410418e-01 + -2.50796499e+00 -5.71169866e-01 + -3.71957427e+00 -1.38195726e+00 + -1.57992670e+00 1.32068593e-01 + -1.35278851e+00 -6.39349270e-01 + 1.23075932e+00 2.40445409e-01 + 1.35606530e+00 4.33180078e-01 + 9.60968518e-02 2.26734255e-01 + 6.22975063e-01 5.03431915e-02 + -1.47624851e+00 -3.60568238e-01 + -2.49337808e+00 -1.15083052e+00 + 2.15717792e+00 1.03071559e+00 + -3.07814376e-02 1.38700314e-02 + 4.52049499e-02 -4.86409775e-01 + 2.58231061e+00 1.14327809e-01 + 1.10999138e+00 -5.18568405e-01 + -2.19426443e-01 -5.37505538e-01 + -4.44740298e-01 6.78099955e-01 + 4.03379080e+00 1.49825720e+00 + -5.13182408e-01 -4.90201950e-01 + -6.90139716e-01 1.63875126e-01 + -8.17281461e-01 2.32155064e-01 + -2.92357619e-01 -8.02573544e-01 + -1.80769841e+00 -7.58907326e-01 + 2.16981590e+00 1.06728873e+00 + 1.98995203e-01 -6.84176682e-02 + -2.39546753e+00 -2.92873789e-01 + -4.24251021e+00 -1.46255564e+00 + -5.01411291e-01 -5.95712813e-03 + 2.68085809e+00 1.42883780e+00 + -4.13289873e+00 -1.62729388e+00 + 1.87957843e+00 3.63341638e-01 + -1.15270744e+00 -3.03563774e-01 + -4.43994248e+00 -2.97323905e+00 + -7.17067733e-01 -7.08349542e-01 + -3.28870393e+00 -1.19263863e+00 + -7.55325944e-01 -5.12703329e-01 + -2.07291938e+00 -2.65025085e-01 + -7.50073814e-01 -1.70771041e-01 + -8.77381404e-01 -5.47417325e-01 + -5.33725862e-01 5.15837119e-01 + 8.45056431e-01 2.82125560e-01 + -1.59598637e+00 -1.38743235e+00 + 1.41362902e+00 1.06407789e+00 + 1.02584504e+00 -3.68219466e-01 + -1.04644488e+00 -1.48769392e-01 + 2.66990191e+00 8.57633492e-01 + -1.84251857e+00 -9.82430175e-01 + 9.71404204e-01 -2.81934209e-01 + -2.50177989e+00 -9.21260335e-01 + -1.31060074e+00 -5.84488113e-01 + -2.12129400e-01 -3.06244708e-02 + -5.28933882e+00 -2.50663129e+00 + 1.90220541e+00 1.08662918e+00 + -3.99366086e-02 -6.87178973e-01 + -4.93417342e-01 4.37354182e-01 + 2.13494486e+00 1.37679569e+00 + 2.18396765e+00 5.81023868e-01 + -3.07866587e+00 -1.45384974e+00 + 6.10894119e-01 -4.17050124e-01 + -1.88766952e+00 -8.86160058e-01 + 3.34527253e+00 1.78571260e+00 + 6.87769059e-01 -5.01157336e-01 + 2.60470837e+00 1.45853560e+00 + -6.49315691e-01 -9.16112805e-01 + -1.29817687e+00 
-2.15924339e-01 + -1.20100409e-03 -4.03137422e-01 + -1.36471594e+00 -6.93266356e-01 + 1.38682062e+00 7.15131598e-01 + 2.47830103e+00 1.24862305e+00 + -2.78288147e+00 -1.03329235e+00 + -7.33443403e-01 -6.11041652e-01 + -4.12745671e-01 -5.96133390e-02 + -2.58632336e+00 -4.51557058e-01 + -1.16570367e+00 -1.27065510e+00 + 2.76187104e+00 2.21895451e-01 + -3.80443767e+00 -1.66319902e+00 + 9.84658633e-01 6.81475569e-01 + 9.33814584e-01 -4.89335563e-02 + -4.63427997e-01 1.72989539e-01 + 1.82401546e+00 3.60164021e-01 + -5.36521077e-01 -8.08691351e-01 + -1.37367030e+00 -1.02126160e+00 + -3.70310682e+00 -1.19840844e+00 + -1.51894242e+00 -3.89510223e-01 + -3.67347940e-01 -3.25540516e-02 + -1.00988595e+00 1.82802194e-01 + 2.01622795e+00 7.86367901e-01 + 1.02440231e+00 8.79780360e-01 + -3.05971480e+00 -8.40901527e-01 + 2.73909457e+00 1.20558628e+00 + 2.39559056e+00 1.10786694e+00 + 1.65471544e+00 7.33824651e-01 + 2.18546787e+00 6.41168955e-01 + 1.47152266e+00 3.91839132e-01 + 1.45811155e+00 5.21820495e-01 + -4.27531469e-02 -3.52343068e-03 + -9.54948010e-01 -1.52313876e-01 + 7.57151215e-01 -5.68728854e-03 + -8.46205751e-01 -7.54580229e-01 + 4.14493548e+00 1.45532780e+00 + 4.58688968e-01 -4.54012803e-02 + -1.49295381e+00 -4.57471758e-01 + 1.80020351e+00 8.13724973e-01 + -5.82727738e+00 -2.18269581e+00 + -2.09017809e+00 -1.18305177e+00 + -2.31628303e+00 -7.21600235e-01 + -8.09679091e-01 -1.49101752e-01 + 8.88005605e-01 8.57940857e-01 + -1.44148219e+00 -3.10926299e-01 + 3.68828186e-01 -3.08848059e-01 + -6.63267389e-01 -8.58950139e-02 + -1.14702569e+00 -6.32147854e-01 + -1.51741715e+00 -8.53330564e-01 + -1.33903718e+00 -1.45875547e-01 + 4.12485387e+00 1.85620435e+00 + -2.42353639e+00 -2.92669850e-01 + 1.88708583e+00 9.35984730e-01 + 2.15585179e+00 6.30469051e-01 + -1.13627973e-01 -1.62554045e-01 + 2.04540494e+00 1.36599834e+00 + 2.81591381e+00 1.60897941e+00 + 3.02736260e-02 3.83255815e-03 + 7.97634013e-02 -2.82035099e-01 + -3.24607473e-01 -5.30065956e-01 + -3.91862894e+00 -1.94083334e+00 + 1.56360901e+00 7.93882743e-01 + -1.03905772e+00 6.25590229e-01 + 2.54746492e+00 1.64233560e+00 + -4.80774423e-01 -8.92298032e-02 + 9.06979990e-02 1.05020427e+00 + -2.47521290e+00 -1.78275982e-01 + -3.91871729e-01 3.80285423e-01 + 1.00658382e+00 4.58947483e-01 + 4.68102941e-01 1.02992741e+00 + 4.44242568e-01 2.89870239e-01 + 3.29684452e+00 1.44677474e+00 + -2.24983007e+00 -9.65574499e-01 + -3.54453926e-01 -3.99020325e-01 + -3.87429665e+00 -1.90079739e+00 + 2.02656674e+00 1.12444894e+00 + 3.77011621e+00 1.43200852e+00 + 1.61259275e+00 4.65417399e-01 + 2.28725434e+00 6.79181395e-01 + 2.75421009e+00 2.27327345e+00 + -2.40894409e+00 -1.03926359e+00 + 1.52996651e-01 -2.73373046e-02 + -2.63218977e+00 -7.22802821e-01 + 2.77688169e+00 1.15310186e+00 + 1.18832341e+00 4.73457165e-01 + -2.35536326e+00 -1.08034554e+00 + -5.84221627e-01 1.03505984e-02 + 2.96730300e+00 1.33478306e+00 + -8.61947692e-01 6.09137051e-02 + 8.22343921e-01 -8.14155286e-02 + 1.75809015e+00 1.07921470e+00 + 1.19501279e+00 1.05309972e+00 + -1.75901792e+00 9.75320161e-02 + 1.64398635e+00 9.54384323e-01 + -2.21878052e-01 -3.64847144e-01 + -2.03128968e+00 -8.57866419e-01 + 1.86750633e+00 7.08524487e-01 + 8.03972976e-01 3.47404314e-01 + 3.41203749e+00 1.39810900e+00 + 4.22397681e-01 -6.41440488e-01 + -4.88493360e+00 -1.58967816e+00 + -1.67649284e-01 -1.08485915e-01 + 2.11489023e+00 1.50506158e+00 + -1.81639929e+00 -3.85542192e-01 + 2.24044819e-01 -1.45100577e-01 + -3.39262411e+00 -1.44394324e+00 + 1.68706599e+00 2.29199618e-01 + -1.94093257e+00 -1.65975814e-01 + 
8.28143367e-01 5.92109281e-01 + -8.29587998e-01 -9.57130831e-01 + -1.50011401e+00 -8.36802092e-01 + 2.40770449e+00 9.32820177e-01 + 7.41391309e-02 3.12878473e-01 + 1.87745264e-01 6.19231425e-01 + 9.57622692e-01 -2.20640033e-01 + 3.18479243e+00 1.02986233e+00 + 2.43133846e+00 8.41302677e-01 + -7.09963834e-01 1.99718943e-01 + -2.88253498e-01 -3.62772094e-01 + 5.14052574e+00 1.79304595e+00 + -3.27930993e+00 -1.29177973e+00 + -1.16723536e+00 1.29519656e-01 + 1.04801056e+00 3.41508300e-01 + -3.99256195e+00 -2.51176471e+00 + -7.62824318e-01 -6.84242153e-01 + 2.71524986e-02 5.35157164e-02 + 3.26430102e+00 1.34887262e+00 + -1.72357766e+00 -4.94524388e-01 + -3.81149536e+00 -1.28121944e+00 + 3.36919354e+00 1.10672075e+00 + -3.14841757e+00 -7.10713767e-01 + -3.16463676e+00 -7.58558435e-01 + -2.44745969e+00 -1.08816514e+00 + 2.79173264e-01 -2.19652051e-02 + 4.15309883e-01 6.07502790e-01 + -9.51007417e-01 -5.83976336e-01 + -1.47929839e+00 -8.39850409e-01 + 2.38335703e+00 6.16055149e-01 + -7.47749031e-01 -5.56164928e-01 + -3.65643622e-01 -5.06684411e-01 + -1.76634163e+00 -7.86382097e-01 + 6.76372222e-01 -3.06592181e-01 + -1.33505058e+00 -1.18301441e-01 + 3.59660179e+00 2.00424178e+00 + -7.88912762e-02 8.71956146e-02 + 1.22656397e+00 1.18149583e+00 + 4.24919729e+00 1.20082355e+00 + 2.94607456e+00 1.00676505e+00 + 7.46061275e-02 4.41761753e-02 + -2.47738025e-02 1.92737701e-01 + -2.20509316e-01 -3.79163193e-01 + -3.50222190e-01 3.58727299e-01 + -3.64788014e+00 -1.36107312e+00 + 3.56062799e+00 9.27032742e-01 + 1.04317289e+00 6.08035970e-01 + 4.06718718e-01 3.00628051e-01 + 4.33158086e+00 2.25860714e+00 + 2.13917145e-01 -1.72757967e-01 + -1.40637998e+00 -1.14119465e+00 + 3.61554872e+00 1.87797348e+00 + 1.01726871e+00 5.70255097e-01 + -7.04902551e-01 2.16444147e-01 + -2.51492186e+00 -8.52997369e-01 + 1.85097530e+00 1.15124496e+00 + -8.67569714e-01 -3.05682432e-01 + 8.07550858e-01 5.88901608e-01 + 1.85186755e-01 -1.94589367e-01 + -1.23378238e+00 -7.84128347e-01 + -1.22713161e+00 -4.21218235e-01 + 2.97751165e-01 2.81055275e-01 + 4.77703554e+00 1.66265524e+00 + 2.51549669e+00 7.49980674e-01 + 2.76510822e-01 1.40456909e-01 + 1.98740905e+00 -1.79608212e-01 + 9.35429145e-01 8.44344180e-01 + -1.20854492e+00 -5.00598453e-01 + 2.29936219e+00 8.10236668e-01 + 6.92555544e-01 -2.65891331e-01 + -1.58050994e+00 2.31237821e-01 + -1.50864880e+00 -9.49661690e-01 + -1.27689206e+00 -7.18260016e-01 + -3.12517127e+00 -1.75587113e+00 + 8.16062912e-02 -6.56551804e-01 + -5.02479939e-01 -4.67162543e-01 + -5.47435788e+00 -2.47799576e+00 + 1.95872901e-02 5.80874076e-01 + -1.59064958e+00 -6.34554756e-01 + -3.77521478e+00 -1.74301790e+00 + 5.89628224e-01 8.55736553e-01 + -1.81903543e+00 -7.50011008e-01 + 1.38557775e+00 3.71490991e-01 + 9.70032652e-01 -7.11356016e-01 + 2.63539625e-01 -4.20994771e-01 + 2.12154222e+00 8.19081400e-01 + -6.56977937e-01 -1.37810098e-01 + 8.91309581e-01 2.77864361e-01 + -7.43693195e-01 -1.46293770e-01 + 2.24447769e+00 4.00911438e-01 + -2.25169262e-01 2.04148801e-02 + 1.68744684e+00 9.47573007e-01 + 2.73086373e-01 3.30877195e-01 + 5.54294414e+00 2.14198009e+00 + -8.49238733e-01 3.65603298e-02 + 2.39685712e+00 1.17951039e+00 + -2.58230528e+00 -5.52116673e-01 + 2.79785277e+00 2.88833717e-01 + -1.96576188e-01 1.11652123e+00 + -4.69383301e-01 1.96496282e-01 + -1.95011845e+00 -6.15235169e-01 + 1.03379890e-02 2.33701239e-01 + 4.18933607e-01 2.77939814e-01 + -1.18473337e+00 -4.10051126e-01 + -7.61499744e-01 -1.43658094e+00 + -1.65586092e+00 -3.41615303e-01 + -5.58523700e-02 -5.21837080e-01 + -2.40331088e+00 
-2.64521583e-01 + 2.24925206e+00 6.79843335e-02 + 1.46360479e+00 1.04271443e+00 + -3.09255443e+00 -1.82548953e+00 + 2.11325841e+00 1.14996627e+00 + -8.70657797e-01 1.02461839e-01 + -5.71056521e-01 9.71232588e-02 + -3.37870752e+00 -1.54091877e+00 + 1.03907189e+00 -1.35661392e-01 + 8.40057486e-01 6.12172413e-02 + -1.30998234e+00 -1.34077226e+00 + 7.53744974e-01 1.49447350e-01 + 9.13995056e-01 -1.81227962e-01 + 2.28386229e-01 3.74498520e-01 + 2.54829151e-01 -2.88802704e-01 + 1.61709009e+00 2.09319193e-01 + -1.12579380e+00 -5.95955338e-01 + -2.69610726e+00 -2.76222736e-01 + -2.63773329e+00 -7.84491970e-01 + -2.62167427e+00 -1.54792874e+00 + -4.80639856e-01 -1.30582102e-01 + -1.26130891e+00 -8.86841840e-01 + -1.24951950e+00 -1.18182622e+00 + -1.40107574e+00 -9.13695575e-01 + 4.99872179e-01 4.69014702e-01 + -2.03550193e-02 -1.48859738e-01 + -1.50189069e+00 -2.97714278e-02 + -2.07846113e+00 -7.29937809e-01 + -5.50576792e-01 -7.03151525e-01 + -3.88069238e+00 -1.63215295e+00 + 2.97032988e+00 6.43571144e-01 + -1.85999273e-01 1.18107620e+00 + 1.79249709e+00 6.65356160e-01 + 2.68842472e+00 1.35703255e+00 + 1.07675417e+00 1.39845588e-01 + 8.01226349e-01 2.11392275e-01 + 9.64329379e-01 3.96146195e-01 + -8.22529511e-01 1.96080831e-01 + 1.92481841e+00 4.62985744e-01 + 3.69756927e-01 3.77135799e-01 + 1.19807835e+00 8.87715050e-01 + -1.01363587e+00 -2.48151636e-01 + 8.53071010e-01 4.96887868e-01 + -3.41120553e+00 -1.35401843e+00 + -2.64787381e+00 -1.08690563e+00 + -1.11416759e+00 -4.43848915e-01 + 1.46242648e+00 6.17106076e-02 + -7.52968881e-01 -9.20972209e-01 + -1.22492228e+00 -5.40327617e-01 + 1.08001827e+00 5.29593785e-01 + -2.58706464e-01 1.13022085e-01 + -4.27394011e-01 1.17864354e-02 + -3.20728413e+00 -1.71224737e-01 + 1.71398530e+00 8.68885893e-01 + 2.12067866e+00 1.45092772e+00 + 4.32782616e-01 -3.34117769e-01 + 7.80084374e-01 -1.35100217e-01 + -2.05547729e+00 -4.70217750e-01 + 2.38379736e+00 1.09186058e+00 + -2.80825477e+00 -1.03320187e+00 + 2.63434576e+00 1.15671733e+00 + -1.60936214e+00 1.91843035e-01 + -5.02298769e+00 -2.32820708e+00 + 1.90349195e+00 1.45215416e+00 + 3.00232888e-01 3.24412586e-01 + -2.46503943e+00 -1.19550010e+00 + 1.06304233e+00 2.20136246e-01 + -2.99101388e+00 -1.58299318e+00 + 2.30071719e+00 1.12881362e+00 + -2.37587247e+00 -8.08298336e-01 + 7.27006308e-01 3.80828984e-01 + 2.61199061e+00 1.56473491e+00 + 8.33936357e-01 -1.42189425e-01 + 3.13291605e+00 1.77771210e+00 + 2.21917371e+00 5.68427075e-01 + 2.38867649e+00 9.06637262e-01 + -6.92959466e+00 -3.57682881e+00 + 2.57904824e+00 5.93959108e-01 + 2.71452670e+00 1.34436199e+00 + 4.39988761e+00 2.13124672e+00 + 5.71783077e-01 5.08346173e-01 + -3.65399429e+00 -1.18192861e+00 + 4.46176453e-01 3.75685594e-02 + -2.97501495e+00 -1.69459236e+00 + 1.60855728e+00 9.20930014e-01 + -1.44270290e+00 -1.93922306e-01 + 1.67624229e+00 1.66233866e+00 + -1.42579598e+00 -1.44990145e-01 + 1.19923176e+00 4.58490278e-01 + -9.00068460e-01 5.09701825e-02 + -1.69391694e+00 -7.60070300e-01 + -1.36576440e+00 -5.24244256e-01 + -1.03016748e+00 -3.44625878e-01 + 2.40519313e+00 1.09947587e+00 + 1.50365433e+00 1.06464802e+00 + -1.07609727e+00 -3.68897187e-01 + 2.44969069e+00 1.28486192e+00 + -1.25610307e+00 -1.14644789e+00 + 2.05962899e+00 4.31162369e-01 + -7.15886908e-01 -6.11587804e-02 + -6.92354119e-01 -7.85019920e-01 + -1.63016508e+00 -5.96944975e-01 + 1.90352536e+00 1.28197457e+00 + -4.01535243e+00 -1.81934488e+00 + -1.07534435e+00 -2.10544784e-01 + 3.25500866e-01 7.69603661e-01 + 2.18443365e+00 6.59773335e-01 + 8.80856790e-01 6.39505913e-01 + 
-2.23956372e-01 -4.65940132e-01 + -1.06766519e+00 -5.38388505e-03 + 7.25556863e-01 -2.91123488e-01 + -4.69451411e-01 7.89182650e-02 + 2.58146587e+00 1.29653243e+00 + 1.53747468e-01 7.69239075e-01 + -4.61152262e-01 -4.04151413e-01 + 1.48183517e+00 8.10079506e-01 + -1.83402614e+00 -1.36939322e+00 + 1.49315501e+00 7.95225425e-01 + 1.41922346e+00 1.05582774e-01 + 1.57473493e-01 9.70795657e-01 + -2.67603254e+00 -7.48562280e-01 + -8.49156216e-01 -6.05762529e-03 + 1.12944274e+00 3.67741591e-01 + 1.94228071e-01 5.28188141e-01 + -3.65610158e-01 4.05851838e-01 + -1.98839111e+00 -1.38452764e+00 + 2.73765752e+00 8.24150530e-01 + 7.63728641e-01 3.51617707e-01 + 5.78307267e+00 1.68103612e+00 + 2.27547227e+00 3.60876164e-01 + -3.50681697e+00 -1.74429984e+00 + 4.01241184e+00 1.26227829e+00 + 2.44946343e+00 9.06119057e-01 + -2.96638941e+00 -9.01532322e-01 + 1.11267643e+00 -3.43333381e-01 + -6.61868994e-01 -3.44666391e-01 + -8.34917179e-01 5.69478372e-01 + -1.91888454e+00 -3.03791075e-01 + 1.50397636e+00 8.31961240e-01 + 6.12260198e+00 2.16851807e+00 + 1.34093127e+00 8.86649385e-01 + 1.48748519e+00 8.26273697e-01 + 7.62243068e-01 2.64841396e-01 + -2.17604986e+00 -3.54219958e-01 + 2.64708640e-01 -4.38136718e-02 + 1.44725372e+00 1.18499914e-01 + -6.71259446e-01 -1.19526851e-01 + 2.40134595e-01 -8.90042323e-02 + -3.57238199e+00 -1.23166201e+00 + -3.77626645e+00 -1.19533443e+00 + -3.81101035e-01 -4.94160532e-01 + -3.02758757e+00 -1.18436066e+00 + 2.59116298e-01 1.38023047e+00 + 4.17900116e+00 1.12065959e+00 + 1.54598848e+00 2.89806755e-01 + 1.00656475e+00 1.76974511e-01 + -4.15730234e-01 -6.22681694e-01 + -6.00903565e-01 -1.43256959e-01 + -6.03652508e-01 -5.09936379e-01 + -1.94096658e+00 -9.48789544e-01 + -1.74464105e+00 -8.50491590e-01 + 1.17652544e+00 1.88118317e+00 + 2.35507776e+00 1.44000205e+00 + 2.63067924e+00 1.06692988e+00 + 2.88805386e+00 1.23924715e+00 + 8.27595008e-01 5.75364692e-01 + 3.91384216e-01 9.72781920e-02 + -1.03866816e+00 -1.37567768e+00 + -1.34777969e+00 -8.40266025e-02 + -4.12904508e+00 -1.67618340e+00 + 1.27918111e+00 3.52085961e-01 + 4.15361174e-01 6.28896189e-01 + -7.00539496e-01 4.80447955e-02 + -1.62332639e+00 -5.98236485e-01 + 1.45957300e+00 1.00305154e+00 + -3.06875603e+00 -1.25897545e+00 + -1.94708176e+00 4.85143006e-01 + 3.55744156e+00 -1.07468822e+00 + 1.21602223e+00 1.28768827e-01 + 1.89093098e+00 -4.70835659e-01 + -6.55759125e+00 2.70114082e+00 + 8.96843535e-01 -3.98115252e-01 + 4.13450429e+00 -2.32069236e+00 + 2.37764218e+00 -1.09098890e+00 + -1.11388901e+00 6.27083097e-01 + -6.34116929e-01 4.62816387e-01 + 2.90203079e+00 -1.33589143e+00 + 3.17457598e+00 -5.13575945e-01 + -1.76362299e+00 5.71820693e-01 + 1.66103362e+00 -8.99466249e-01 + -2.53947433e+00 8.40084780e-01 + 4.36631397e-01 7.24234261e-02 + -1.87589394e+00 5.08529113e-01 + 4.49563965e+00 -9.43365992e-01 + 1.78876299e+00 -1.27076149e+00 + -1.16269107e-01 -4.55078316e-01 + 1.92966079e+00 -8.05371385e-01 + 2.20632583e+00 -9.00919345e-01 + 1.52387824e+00 -4.82391996e-01 + 8.04004564e-01 -2.73650595e-01 + -7.75326067e-01 1.07469566e+00 + 1.83226282e+00 -4.52173344e-01 + 1.25079758e-01 -3.52895417e-02 + -9.90957437e-01 8.55993130e-01 + 1.71623322e+00 -7.08691667e-01 + -2.86175924e+00 6.75160955e-01 + -8.40817853e-01 -1.00361809e-01 + 1.33393000e+00 -4.65788123e-01 + 5.29394114e-01 -5.44881619e-02 + -8.07435599e-01 8.27353370e-01 + -4.33165824e+00 1.97299638e+00 + 1.26452422e+00 -8.34070486e-01 + 1.45996394e-02 2.97736043e-01 + -1.64489287e+00 6.72839598e-01 + -5.74234578e+00 3.20975117e+00 + 2.13841341e-02 3.64514015e-01 
+ 6.68084924e+00 -2.27464254e+00 + -3.22881590e+00 8.01879324e-01 + 3.02534313e-01 -4.56222796e-01 + -5.84520734e+00 1.95678162e+00 + 2.81515232e+00 -1.72101318e+00 + -2.39620908e-01 2.69145522e-01 + -7.41669691e-01 -2.30283281e-01 + -2.15682714e+00 3.45313021e-01 + 1.23475788e+00 -7.32276553e-01 + -1.71816113e-01 1.20419560e-02 + 1.89174235e+00 2.27435901e-01 + -3.64511114e-01 1.72260361e-02 + -3.24143860e+00 6.50125817e-01 + -2.25707409e+00 5.66970751e-01 + 1.03901456e+00 -1.00588433e+00 + -5.09159710e+00 1.58736109e+00 + 1.45534075e+00 -5.83787452e-01 + 4.28879587e+00 -1.58006866e+00 + 8.52384427e-01 -1.11042299e+00 + 4.51431615e+00 -2.63844265e+00 + -4.33042648e+00 1.86497078e+00 + -2.13568046e+00 5.82559743e-01 + -4.42568887e+00 1.26131214e+00 + 3.15821315e+00 -1.61515905e+00 + -3.14125204e+00 8.49604386e-01 + 6.54152300e-01 -2.04624711e-01 + -3.73374317e-01 9.94187820e-02 + -3.96177282e+00 1.27245623e+00 + 9.59825199e-01 -1.15547861e+00 + 3.56902055e+00 -1.46591091e+00 + 1.55433633e-02 6.93544345e-01 + 1.15684646e+00 -4.99836352e-01 + 3.11824573e+00 -4.75900506e-01 + -8.61706369e-01 -3.50774059e-01 + 9.89057391e-01 -7.16878802e-01 + -4.94787870e+00 2.09137481e+00 + 1.37777347e+00 -1.34946349e+00 + -1.13161577e+00 8.05114754e-01 + 8.12020675e-01 -1.04849421e+00 + 4.73783881e+00 -2.26718812e+00 + 8.99579366e-01 -8.89764451e-02 + 4.78524868e+00 -2.25795843e+00 + 1.75164590e+00 -1.73822209e-01 + 1.30204590e+00 -7.26724717e-01 + -7.26526403e-01 -5.23925361e-02 + 2.01255351e+00 -1.69965366e+00 + 9.87852740e-01 -4.63577220e-01 + 2.45957762e+00 -1.29278962e+00 + -3.13817948e+00 1.64433038e+00 + -1.76302159e+00 9.62784302e-01 + -1.91106331e+00 5.81460008e-01 + -3.30883001e+00 1.30378978e+00 + 5.54376450e-01 3.78814272e-01 + 1.09982111e+00 -1.47969612e+00 + -2.61300705e-02 -1.42573464e-01 + -2.22096157e+00 7.75684440e-01 + 1.70319323e+00 -2.89738444e-01 + -1.43223842e+00 6.39284281e-01 + 2.34360959e-01 -1.64379268e-01 + -2.67147991e+00 9.46548086e-01 + 1.51131425e+00 -4.91594395e-01 + -2.48446856e+00 1.01286123e+00 + 1.50534658e-01 -2.94620246e-01 + -1.66966792e+00 1.67755508e+00 + -1.50094241e+00 3.30163095e-01 + 2.27681194e+00 -1.08064317e+00 + 2.05122965e+00 -1.15165939e+00 + -4.23509309e-01 -6.56906167e-02 + 1.80084023e+00 -1.07228556e+00 + -2.65769521e+00 1.18023206e+00 + 2.02852676e+00 -8.06793574e-02 + -4.49544185e+00 2.68200163e+00 + -7.50043216e-01 1.17079331e+00 + 6.80060893e-02 3.99055351e-01 + -3.83634635e+00 1.38406887e+00 + 3.24858545e-01 -9.25273218e-02 + -2.19895100e+00 1.47819500e+00 + -3.61569522e-01 -1.03188739e-01 + 1.12180375e-01 -9.52696354e-02 + -1.31477803e+00 1.79900570e-01 + 2.39573628e+00 -6.09739269e-01 + -1.00135700e+00 6.02837296e-01 + -4.11994589e+00 2.49599192e+00 + -1.54196236e-01 -4.84921951e-01 + 5.92569908e-01 -1.87310359e-01 + 3.85407741e+00 -1.50979925e+00 + 5.17802528e+00 -2.26032607e+00 + -1.37018916e+00 1.87111822e-01 + 8.46682996e-01 -3.56676331e-01 + -1.17559949e+00 5.29057734e-02 + -5.56475671e-02 6.79049243e-02 + 1.07851745e+00 -5.14535101e-01 + -2.71622446e+00 1.00151846e+00 + -1.08477208e+00 8.81391054e-01 + 5.50755824e-01 -5.20577727e-02 + 4.70885495e+00 -2.04220397e+00 + -1.87375336e-01 -6.16962830e-02 + 3.52097100e-01 2.21163550e-01 + 7.07929984e-01 -1.75827590e-01 + -1.22149219e+00 1.83084346e-01 + 2.58247412e+00 -6.15914898e-01 + -6.01206182e-01 -2.29832987e-01 + 9.83360449e-01 -3.75870060e-01 + -3.20027685e+00 1.35467480e+00 + 1.79178978e+00 -1.38531981e+00 + -3.30376867e-01 -1.16250192e-01 + -1.89053055e+00 5.68463567e-01 + -4.20604849e+00 
1.65429681e+00 + -1.01185529e+00 1.92801240e-01 + -6.18819882e-01 5.42206996e-01 + -5.08091672e+00 2.61598591e+00 + -2.62570344e+00 2.51590658e+00 + 3.05577906e+00 -1.49090609e+00 + 2.77609677e+00 -1.37681378e+00 + -7.93515301e-02 4.28072744e-01 + -2.08359471e+00 8.94334295e-01 + 2.20163801e+00 4.01127167e-02 + -1.18145785e-01 -2.06822464e-01 + -2.74788298e-01 2.96250607e-01 + 1.59613555e+00 -3.87246203e-01 + -3.82971472e-01 -3.39716093e-02 + -4.20311307e-02 3.88529510e-01 + 1.52128574e+00 -9.33138876e-01 + -9.06584458e-01 -2.75016094e-02 + 3.56216834e+00 -9.99384622e-01 + 2.11964220e+00 -9.98749118e-02 + 4.01203480e+00 -2.03032745e+00 + -1.24171557e+00 1.97596725e-01 + -1.57230455e+00 4.14126609e-01 + -1.85484741e+00 5.40041563e-01 + 1.76329831e+00 -6.95967734e-01 + -2.29439232e-01 5.08669245e-01 + -5.45124276e+00 2.26907549e+00 + -5.71364288e-02 5.04476476e-01 + 3.12468018e+00 -1.46358879e+00 + 8.20017359e-01 6.51949028e-01 + -1.33977500e+00 2.83634232e-04 + -1.83311685e+00 1.23947117e+00 + 6.31205922e-01 1.19792164e-02 + -2.21967834e+00 6.94056232e-01 + -1.41693842e+00 9.93526233e-01 + -7.58885703e-01 6.78547347e-01 + 3.60239086e+00 -1.08644935e+00 + 6.72217073e-02 3.00036011e-02 + -3.42680958e-01 -3.48049352e-01 + 1.87546079e+00 -4.78018246e-01 + 7.00485821e-01 -3.52905383e-01 + -8.54580948e-01 8.17330861e-01 + 8.19123706e-01 -5.73927281e-01 + 2.70855639e-01 -3.08940052e-01 + -1.05059952e+00 3.27873168e-01 + 1.08282999e+00 4.84559349e-02 + -7.89899220e-01 1.22291138e+00 + -2.87939816e+00 7.17403497e-01 + -2.08429452e+00 8.87409226e-01 + 1.58409232e+00 -4.74123532e-01 + 1.26882735e+00 1.59162510e-01 + -2.53782993e+00 6.18253491e-01 + -8.92757445e-01 3.35979011e-01 + 1.31867900e+00 -1.17355054e+00 + 1.14918879e-01 -5.35184038e-01 + -1.70288738e-01 5.35868087e-02 + 4.21355121e-01 5.41848690e-02 + 2.07926943e+00 -5.72538144e-01 + 4.08788970e-01 3.77655777e-01 + -3.39631381e+00 9.84216764e-01 + 2.94170163e+00 -1.83120916e+00 + -7.94798752e-01 7.39889052e-01 + 1.46555463e+00 -4.62275563e-01 + 2.57255955e+00 -1.04671434e+00 + 8.45042540e-01 -1.96952892e-01 + -3.23526646e+00 1.60049846e+00 + 3.21948565e+00 -8.88376674e-01 + 1.43005104e+00 -9.21561086e-01 + 8.82360506e-01 2.98403872e-01 + -8.91168097e-01 1.01319072e+00 + -5.13215241e-01 -2.47182649e-01 + -1.35759444e+00 7.07450608e-02 + -4.04550983e+00 2.23534867e+00 + 1.39348883e+00 3.81637747e-01 + -2.85676418e+00 1.53240862e+00 + -1.37183120e+00 6.37977425e-02 + -3.88195859e+00 1.73887145e+00 + 1.19509776e+00 -6.25013512e-01 + -2.80062734e+00 1.79840585e+00 + 1.96558429e+00 -4.70997234e-01 + 1.93111352e+00 -9.70318441e-01 + 3.57991190e+00 -1.65065116e+00 + 2.12831714e+00 -1.11531708e+00 + -3.95661018e-01 -8.54339904e-02 + -2.41630441e+00 1.65166304e+00 + 7.55412624e-01 -1.53453579e-01 + -1.77043450e+00 1.39928715e+00 + -9.32631260e-01 8.73649199e-01 + 1.53342205e+00 -8.39569765e-01 + -6.29846924e-02 1.25023084e-01 + 3.31509049e+00 -1.10733235e+00 + -2.18957109e+00 3.07376993e-01 + -2.35740747e+00 6.47437564e-01 + -2.22142438e+00 8.47318938e-01 + -6.51401147e-01 3.48398562e-01 + 2.75763095e+00 -1.21390708e+00 + 1.12550484e+00 -5.61412847e-01 + -5.65053161e-01 6.74365205e-02 + 1.68952456e+00 -6.57566096e-01 + 8.95598401e-01 3.96738993e-01 + -1.86537066e+00 9.44129208e-01 + -2.59933294e+00 2.57423247e-01 + -6.59598267e-01 1.91828851e-02 + -2.64506676e+00 8.41783205e-01 + -1.25911802e+00 5.52425066e-01 + -1.39754507e+00 3.73689222e-01 + 5.49550729e-02 1.35071215e+00 + 3.31874811e+00 -1.05682424e+00 + 3.63159604e+00 -1.42864695e+00 + 
-4.45944617e+00 1.42889446e+00 + 5.87314342e-01 -4.88892988e-01 + -7.26130820e-01 1.51936106e-01 + -1.79246441e+00 6.05888105e-01 + -5.50948207e-01 6.21443081e-01 + -3.17246063e-01 1.77213880e-01 + -2.00098937e+00 1.23799074e+00 + 4.33790961e+00 -1.08490465e+00 + -2.03114114e+00 1.31613237e+00 + -6.29216542e+00 1.92406317e+00 + -1.60265624e+00 8.87947500e-01 + 8.64465062e-01 -8.37416270e-01 + -2.14273937e+00 8.05485900e-01 + -2.36844256e+00 6.17915124e-01 + -1.40429636e+00 6.78296866e-01 + 9.99019988e-01 -5.84297572e-01 + 7.38824546e-01 1.68838678e-01 + 1.45681238e+00 3.04641461e-01 + 2.15914949e+00 -3.43089227e-01 + -1.23895930e+00 1.05339864e-01 + -1.23162264e+00 6.46629863e-01 + 2.28183862e+00 -9.24157063e-01 + -4.29615882e-01 5.69130863e-01 + -1.37449121e+00 -9.12032183e-01 + -7.33890904e-01 -3.91865471e-02 + 8.41400661e-01 -4.76002200e-01 + -1.73349274e-01 -6.84143467e-02 + 3.16042891e+00 -1.32651856e+00 + -3.78244609e+00 2.38619718e+00 + -3.69634380e+00 2.22368561e+00 + 1.83766344e+00 -1.65675953e+00 + -1.63206002e+00 1.19484469e+00 + 3.68480064e-01 -5.70764494e-01 + 3.61982479e-01 1.04274409e-01 + 2.48863048e+00 -1.13285542e+00 + -2.81896488e+00 9.47958768e-01 + 5.74952901e-01 -2.75959392e-01 + 3.72783275e-01 -3.48937848e-01 + 1.95935716e+00 -1.06750415e+00 + 5.19357531e+00 -2.32070803e+00 + 4.09246149e+00 -1.89976700e+00 + -3.36666087e-01 8.17645057e-02 + 1.85453493e-01 3.76913151e-01 + -3.06458262e+00 1.34106402e+00 + -3.13796566e+00 7.00485099e-01 + 1.42964058e+00 -1.35536932e-01 + -1.23440423e-01 4.60094177e-02 + -2.86753037e+00 -5.21724160e-02 + 2.67113726e+00 -1.83746924e+00 + -1.35335062e+00 1.28238073e+00 + -2.43569899e+00 1.25998539e+00 + 1.26036740e-01 -2.35416844e-01 + -1.35725745e+00 7.37788491e-01 + -3.80897538e-01 3.30757889e-01 + 6.58694434e-01 -1.07566603e+00 + 2.11273640e+00 -9.02260632e-01 + 4.00755057e-01 -2.49229150e-02 + -1.80095812e+00 9.73099742e-01 + -2.68408372e+00 1.63737364e+00 + -2.66079826e+00 7.47289412e-01 + -9.92321439e-02 -1.49331396e-01 + 4.45678251e+00 -1.80352394e+00 + 1.35962915e+00 -1.31554389e+00 + -7.76601417e-01 -9.66173523e-02 + 1.68096348e+00 -6.27235133e-01 + 1.53081227e-01 -3.54216830e-01 + -1.54913095e+00 3.43689269e-01 + 5.29187357e-02 -6.73916964e-01 + -2.06606084e+00 8.34784242e-01 + 1.73701179e+00 -6.06467340e-01 + 1.55856757e+00 -2.58642780e-01 + 1.04349101e+00 -4.43027348e-01 + -1.02397719e+00 1.01308824e+00 + -2.13860204e-01 -4.73347361e-01 + -2.59004955e+00 1.43367853e+00 + 7.98457679e-01 2.18621627e-02 + -1.32974762e+00 4.61802208e-01 + 3.21419359e-01 2.30723316e-02 + 2.87201888e-02 6.24566672e-02 + -1.22261418e+00 6.02340363e-01 + 1.28750335e+00 -3.34839548e-02 + -9.67952623e-01 4.34470505e-01 + 2.02850324e+00 -9.05160255e-01 + -4.13946010e+00 2.33779091e+00 + -4.47508806e-01 3.06440495e-01 + -3.91543394e+00 1.68251022e+00 + -6.45193001e-01 5.29781162e-01 + -2.15518916e-02 5.07278355e-01 + -2.83356868e+00 1.00670227e+00 + 1.82989749e+00 -1.37329222e+00 + -1.09330213e+00 1.08560688e+00 + 1.90533722e+00 -1.28905879e+00 + 2.33986084e+00 2.30642626e-02 + 8.01940220e-01 -1.63986962e+00 + -4.23415165e+00 2.07530423e+00 + 9.33382522e-01 -7.62917211e-01 + -1.84033954e+00 1.07469401e+00 + -2.81938669e+00 1.07342024e+00 + -7.05169988e-01 2.13124943e-01 + 5.09598137e-01 1.32725493e-01 + -2.34558226e+00 8.62383168e-01 + -1.70322072e+00 2.70893796e-01 + 1.23652660e+00 -7.53216034e-02 + 2.84660646e+00 -3.48178304e-02 + 2.50250128e+00 -1.27770855e+00 + -1.00279469e+00 8.77194218e-01 + -4.34674121e-02 -2.12091350e-01 + -5.84151289e-01 
1.50382340e-01 + -1.79024013e+00 4.24972808e-01 + -1.23434666e+00 -8.85546570e-02 + 1.36575412e+00 -6.42639880e-01 + -1.98429947e+00 2.27650336e-01 + 2.36253589e+00 -1.51340773e+00 + 8.79157643e-01 6.84142159e-01 + -2.18577755e+00 2.76526200e-01 + -3.55473434e-01 8.29976561e-01 + 1.16442595e+00 -5.97699411e-01 + -7.35528097e-01 2.40318183e-01 + -1.73702631e-01 7.33788663e-02 + -1.40451745e+00 3.24899628e-01 + -2.05434385e+00 5.68123738e-01 + 8.47876642e-01 -5.74224294e-01 + -6.91955602e-01 1.26009087e+00 + 2.56574498e+00 -1.15602581e+00 + 3.93306545e+00 -1.38398209e+00 + -2.73230251e+00 4.89062581e-01 + -1.04315474e+00 6.06335547e-01 + 1.23231431e+00 -4.46675065e-01 + -3.93035285e+00 1.43287651e+00 + -1.02132111e+00 9.58919791e-01 + -1.49425352e+00 1.06456165e+00 + -6.26485337e-01 1.03791402e+00 + -6.61772998e-01 2.63275425e-01 + -1.80940386e+00 5.70767403e-01 + 9.83720450e-01 -1.39449756e-01 + -2.24619662e+00 9.01044870e-01 + 8.94343014e-01 5.31038678e-02 + 1.95518199e-01 -2.81343295e-01 + -2.30533019e-01 -1.74478106e-01 + -2.01550361e+00 5.55958010e-01 + -4.36281469e+00 1.94374226e+00 + -5.18530457e+00 2.89278357e+00 + 2.67289101e+00 -2.98511449e-01 + -1.53566179e+00 -1.00588944e-01 + -6.09943217e-02 -1.56986047e-01 + -5.22146452e+00 1.66209208e+00 + -3.69777478e+00 2.26154873e+00 + 2.24607181e-01 -4.86934960e-01 + 2.49909450e+00 -1.03033370e+00 + -1.07841120e+00 8.22388054e-01 + -3.20697089e+00 1.09536143e+00 + 3.43524232e+00 -1.47289362e+00 + -5.65784134e-01 4.60365175e-01 + -1.76714734e+00 1.57752346e-01 + -7.77620365e-01 5.60153443e-01 + 6.34399352e-01 -5.22339836e-01 + 2.91011875e+00 -9.72623380e-01 + -1.19286824e+00 6.32370253e-01 + -2.18327609e-01 8.23953181e-01 + 3.42430842e-01 1.37098055e-01 + 1.28658034e+00 -9.11357320e-01 + 2.06914465e+00 -6.67556382e-01 + -6.69451020e-01 -6.38605102e-01 + -2.09312398e+00 1.16743634e+00 + -3.63778357e+00 1.91919157e+00 + 8.74685911e-01 -1.09931208e+00 + -3.91496791e+00 1.00808357e+00 + 1.29621330e+00 -8.32239802e-01 + 9.00222045e-01 -1.31159793e+00 + -1.12242062e+00 1.98517079e-01 + -3.71932852e-01 1.31667093e-01 + -2.23829610e+00 1.26328346e+00 + -2.08365062e+00 9.93385336e-01 + -1.91082720e+00 7.45866855e-01 + 4.38024917e+00 -2.05901118e+00 + -2.28872886e+00 6.85279335e-01 + 1.01274497e-01 -3.26227153e-01 + -5.04447572e-01 -3.18619513e-01 + 1.28537006e+00 -1.04573551e+00 + -7.83175212e-01 1.54791645e-01 + -3.89239175e+00 1.60017929e+00 + -8.87877111e-01 -1.04968005e-01 + 9.32215179e-01 -5.58691113e-01 + -6.44977127e-01 -2.23018375e-01 + 1.10141900e+00 -1.00666432e+00 + 2.92755687e-01 -1.45480350e-01 + 7.73580681e-01 -2.21150567e-01 + -1.40873709e+00 7.61548044e-01 + -8.89031805e-01 -3.48542923e-01 + 4.16844267e-01 -2.39914494e-01 + -4.64265832e-01 7.29581138e-01 + 1.99835179e+00 -7.70542813e-01 + 4.20523191e-02 -2.18783563e-01 + -6.32611758e-01 -3.09926115e-01 + 6.82912198e-02 -8.48327050e-01 + 1.92425229e+00 -1.37876951e+00 + 3.49461782e+00 -1.88354255e+00 + -3.25209026e+00 1.49809395e+00 + 6.59273182e-01 -2.37435654e-01 + -1.15517300e+00 8.46134387e-01 + 1.26756151e+00 -4.58988026e-01 + -3.99178418e+00 2.04153008e+00 + 7.05687841e-01 -6.83433306e-01 + -1.61997342e+00 8.16577004e-01 + -3.89750399e-01 4.29753250e-01 + -2.53026432e-01 4.92861432e-01 + -3.16788324e+00 4.44285524e-01 + -7.86248901e-01 1.12753716e+00 + -3.02351433e+00 1.28419015e+00 + -1.30131355e+00 1.71226678e+00 + -4.08843475e+00 1.62063214e+00 + -3.09209403e+00 1.19958520e+00 + 1.49102271e+00 -1.11834864e+00 + -3.18059348e+00 5.74587042e-01 + 2.06054867e+00 3.25797860e-03 + 
-3.50999200e+00 2.02412428e+00 + -8.26610023e-01 3.46528211e-01 + 2.00546034e+00 -4.07333110e-01 + -9.69941653e-01 4.80953753e-01 + 4.47925660e+00 -2.33127314e+00 + 2.03845790e+00 -9.90439915e-01 + -1.11349191e+00 4.31183918e-01 + -4.03628396e+00 1.68509679e+00 + -1.48177601e+00 7.74322088e-01 + 3.07369385e+00 -9.57465886e-01 + 2.39011286e+00 -6.44506921e-01 + 2.91561991e+00 -8.78627328e-01 + 1.10212733e+00 -4.21637388e-01 + 5.31985231e-01 -6.17445696e-01 + -6.82340929e-01 -2.93529716e-01 + 1.94290679e+00 -4.64268634e-01 + 1.92262116e+00 -7.93142835e-01 + 4.73762800e+00 -1.63654174e+00 + -3.17848641e+00 8.05791391e-01 + 4.08739432e+00 -1.80816807e+00 + -7.60648826e-01 1.24216138e-01 + -2.24716400e+00 7.90020937e-01 + 1.64284052e+00 -7.18784070e-01 + 1.04410012e-01 -7.11195880e-02 + 2.18268225e+00 -7.01767831e-01 + 2.06218013e+00 -8.70251746e-01 + -1.35266581e+00 7.08456358e-01 + -1.38157779e+00 5.14401086e-01 + -3.28326008e+00 1.20988399e+00 + 8.85358917e-01 -8.12213495e-01 + -2.34067500e+00 3.67657353e-01 + 3.96878127e+00 -1.66841450e+00 + 1.36518053e+00 -8.33436812e-01 + 5.25771988e-01 -5.06121987e-01 + -2.25948361e+00 1.30663765e+00 + -2.57662070e+00 6.32114628e-01 + -3.43134685e+00 2.38106008e+00 + 2.31571924e+00 -1.56566818e+00 + -2.95397202e+00 1.05661888e+00 + -1.35331242e+00 6.76383411e-01 + 1.40977132e+00 -1.17775938e+00 + 1.52561996e+00 -9.83147176e-01 + 2.26550832e+00 -2.10464123e-02 + 6.23371684e-01 -5.30768122e-01 + -4.42356624e-01 9.72226986e-01 + 2.31517901e+00 -1.08468105e+00 + 1.97236640e+00 -1.42016619e+00 + 3.18618687e+00 -1.45056343e+00 + -2.75880360e+00 5.40254980e-01 + -1.92916581e+00 1.45029864e-01 + 1.90022524e+00 -6.03805754e-01 + -1.05446211e+00 5.74361752e-01 + 1.45990390e+00 -9.28233993e-01 + 5.14960557e+00 -2.07564096e+00 + -7.53104842e-01 1.55876958e-01 + 8.09490983e-02 -8.58886384e-02 + -1.56894969e+00 4.53497227e-01 + 1.36944658e-01 5.60670875e-01 + -5.32635329e-01 4.40309945e-01 + 1.32507853e+00 -5.83670099e-01 + 1.20676031e+00 -8.02296831e-01 + -3.65023422e+00 1.17211368e+00 + 1.53393850e+00 -6.17771312e-01 + -3.99977129e+00 1.71415137e+00 + 5.70705058e-01 -4.60771539e-01 + -2.20608002e+00 1.07866596e+00 + -1.09040244e+00 6.77441076e-01 + -5.09886482e-01 -1.97282128e-01 + -1.58062785e+00 6.18333697e-01 + -1.53295020e+00 4.02168701e-01 + -5.18580598e-01 2.25767177e-01 + 1.59514316e+00 -2.54983617e-01 + -5.91938655e+00 2.68223782e+00 + 2.84200509e+00 -1.04685313e+00 + 1.31298664e+00 -1.16672614e+00 + -2.36660033e+00 1.81359460e+00 + 6.94163290e-02 3.76658816e-01 + 2.33973934e+00 -8.33173023e-01 + -8.24640389e-01 7.83717285e-01 + -1.02888281e+00 1.04680766e+00 + 1.34750745e+00 -5.89568160e-01 + -2.48761231e+00 7.44199284e-01 + -1.04501559e+00 4.72326911e-01 + -3.14610089e+00 1.89843692e+00 + 2.13003416e-01 5.76633620e-01 + -1.69239608e+00 5.66070021e-01 + 1.80491280e+00 -9.31701080e-01 + -6.94362572e-02 6.96026587e-01 + 1.36502578e+00 -6.85599000e-02 + -7.76764337e-01 3.64328661e-01 + -2.67322167e+00 6.80150021e-01 + 1.84338485e+00 -1.18487494e+00 + 2.88009231e+00 -1.25700411e+00 + 1.17114433e+00 -7.69727080e-01 + 2.11576167e+00 2.81502116e-01 + -1.51470088e+00 2.61553540e-01 + 1.18923669e-01 -1.17890202e-01 + 4.48359786e+00 -1.81427466e+00 + -1.27055948e+00 9.92388998e-01 + -8.00276606e-01 9.11326621e-02 + 7.51764024e-01 -1.03676498e-01 + 1.35769348e-01 -2.11470084e-01 + 2.50731332e+00 -1.12418270e+00 + -2.49752781e-01 7.81224033e-02 + -6.23037902e-01 3.16599691e-01 + -3.93772902e+00 1.37195391e+00 + 1.74256361e+00 -1.12363582e+00 + -1.49737281e+00 
5.98828310e-01 + 7.75592115e-01 -4.64733802e-01 + -2.26027693e+00 1.36991118e+00 + -1.62849836e+00 7.36899107e-01 + 2.36850751e+00 -9.32126872e-01 + 5.86169745e+00 -2.49342512e+00 + -5.37092226e-01 1.23821274e+00 + 2.80535867e+00 -1.93363302e+00 + -1.77638106e+00 9.10050276e-01 + 3.02692018e+00 -1.60774676e+00 + 1.97833084e+00 -1.50636531e+00 + 9.09168906e-01 -8.83799359e-01 + 2.39769655e+00 -7.56977869e-01 + 1.47283981e+00 -1.06749890e+00 + 2.92060943e-01 -6.07040605e-01 + -2.09278201e+00 7.71858590e-01 + 7.10015905e-01 -5.42768432e-01 + -2.16826169e-01 1.56897896e-01 + 4.56288247e+00 -2.08912680e+00 + -6.63374020e-01 6.67325183e-01 + 1.80564442e+00 -9.76366134e-01 + 3.28720168e+00 -4.66575145e-01 + -1.60463695e-01 -2.58428153e-01 + 1.78590750e+00 -3.96427146e-01 + 2.75950306e+00 -1.82102856e+00 + -1.18234310e+00 6.28073320e-01 + 4.11415835e+00 -2.33551216e+00 + 1.38721004e+00 -2.77450622e-01 + -2.94903545e+00 1.74813352e+00 + 8.67290400e-01 -6.51667894e-01 + 2.70022274e+00 -8.11832480e-01 + -2.06766146e+00 8.24047249e-01 + 3.90717142e+00 -1.20155758e+00 + -2.95102809e+00 1.36667968e+00 + 6.08815147e+00 -2.60737974e+00 + 2.78576476e+00 -7.86628755e-01 + -3.26258407e+00 1.09302450e+00 + 1.59849422e+00 -1.09705202e+00 + -2.50600710e-01 1.63243175e-01 + -4.90477087e-01 -4.57729572e-01 + -1.24837181e+00 3.22157840e-01 + -2.46341049e+00 1.06517849e+00 + 9.62880751e-01 4.56962496e-01 + 3.99964487e-01 2.07472802e-01 + 6.36657705e-01 -3.46400942e-02 + 4.91231407e-02 -1.40289235e-02 + -4.66683524e-02 -3.72326100e-01 + -5.22049702e-01 -1.70440260e-01 + 5.27062938e-01 -2.32628395e-01 + -2.69440318e+00 1.18914874e+00 + 3.65087539e+00 -1.53427267e+00 + -1.16546364e-01 4.93245392e-02 + 7.55931384e-01 -3.02980139e-01 + 2.06338745e+00 -6.24841225e-01 + 1.31177908e-01 7.29338183e-01 + 1.48021784e+00 -6.39509896e-01 + -5.98656707e-01 2.84525503e-01 + -2.18611080e+00 1.79549812e+00 + -2.91673624e+00 2.15772237e-01 + -8.95591350e-01 7.68250538e-01 + 1.36139762e+00 -1.93845144e-01 + 5.45730414e+00 -2.28114404e+00 + 3.22747247e-01 9.33582332e-01 + -1.46384504e+00 1.12801186e-01 + 4.26728166e-01 -2.33481242e-01 + -1.41327270e+00 8.16103740e-01 + -2.53998067e-01 1.44906646e-01 + -1.32436467e+00 1.87556361e-01 + -3.77313086e+00 1.32896038e+00 + 3.77651731e+00 -1.76548043e+00 + -2.45297093e+00 1.32571926e+00 + -6.55900588e-01 3.56921462e-01 + 9.25558722e-01 -4.51988954e-01 + 1.20732231e+00 -3.02821614e-01 + 3.72660154e-01 -1.89365208e-01 + -1.77090939e+00 9.18087975e-01 + 3.01127567e-01 2.67965829e-01 + -1.76708900e+00 4.62069259e-01 + -2.71812099e+00 1.57233508e+00 + -5.35297633e-01 4.99231535e-01 + 1.50507631e+00 -9.85763646e-01 + 3.00424787e+00 -1.29837562e+00 + -4.99311105e-01 3.91086482e-01 + 1.30125207e+00 -1.26247924e-01 + 4.01699483e-01 -4.46909391e-01 + -1.33635257e+00 5.12068703e-01 + 1.39229757e+00 -9.10974858e-01 + -1.74229508e+00 1.49475978e+00 + -1.21489414e+00 4.04193753e-01 + -3.36537605e-01 -6.74335427e-01 + -2.79186828e-01 8.48314720e-01 + -2.03080140e+00 1.66599815e+00 + -3.53064281e-01 -7.68582906e-04 + -5.30305657e+00 2.91091546e+00 + -1.20049972e+00 8.26578358e-01 + 2.95906989e-01 2.40215920e-01 + -1.42955534e+00 4.63480310e-01 + -1.87856619e+00 8.21459385e-01 + -2.71124720e+00 1.80246843e+00 + -3.06933780e+00 1.22235760e+00 + 5.21935582e-01 -1.27298218e+00 + -1.34175797e+00 7.69018937e-01 + -1.81962785e+00 1.15528991e+00 + -3.99227550e-01 2.93821598e-01 + 1.22533179e+00 -4.73846323e-01 + -2.08068359e-01 -1.75039817e-01 + -2.03068526e+00 1.50370503e+00 + -3.27606113e+00 1.74906330e+00 + 
-4.37802587e-01 -2.26956048e-01 + -7.69774213e-02 -3.54922468e-01 + 6.47160749e-02 -2.07334721e-01 + -1.37791524e+00 4.43766709e-01 + 3.29846803e+00 -1.04060799e+00 + -3.63704046e+00 1.05800226e+00 + -1.26716116e+00 1.13077353e+00 + 1.98549075e+00 -1.31864807e+00 + 1.85159500e+00 -5.78629560e-01 + -1.55295206e+00 1.23655857e+00 + 6.76026255e-01 9.18824125e-02 + 1.23418960e+00 -4.68162027e-01 + 2.43186642e+00 -9.22422440e-01 + -3.18729701e+00 1.77582673e+00 + -4.02945613e+00 1.14303496e+00 + -1.92694576e-01 1.03301431e-01 + 1.89554730e+00 -4.60128096e-01 + -2.55626581e+00 1.16057084e+00 + 6.89144365e-01 -9.94982900e-01 + -4.44680606e+00 2.19751983e+00 + -3.15196193e+00 1.18762993e+00 + -1.17434977e+00 1.04534656e+00 + 8.58386984e-02 -1.03947487e+00 + 3.33354973e-01 5.54813610e-01 + -9.37631808e-01 3.33450150e-01 + -2.50232471e+00 5.39720635e-01 + 1.03611949e+00 -7.16304095e-02 + -2.05556816e-02 -3.28992265e-01 + -2.24176201e+00 1.13077506e+00 + 4.53583688e+00 -1.10710212e+00 + 4.77389762e-01 -8.99445512e-01 + -2.69075551e+00 6.83176866e-01 + -2.21779724e+00 1.16916849e+00 + -1.09669056e+00 2.10044765e-01 + -8.45367920e-01 -8.45951423e-02 + 4.37558941e-01 -6.95904256e-01 + 1.84884195e+00 -1.71205136e-01 + -8.36371957e-01 5.62862478e-01 + 1.27786531e+00 -1.33362147e+00 + 2.90684492e+00 -7.49892184e-01 + -3.38652716e+00 1.51180670e+00 + -1.30945978e+00 7.09261928e-01 + -7.50471924e-01 -5.24637889e-01 + 1.18580718e+00 -9.97943971e-04 + -7.55395645e+00 3.19273590e+00 + 1.72822535e+00 -1.20996962e+00 + 5.67374320e-01 6.19573416e-01 + -2.99163781e+00 1.79721534e+00 + 1.49862187e+00 -6.05631846e-02 + 1.79503506e+00 -4.90419706e-01 + 3.85626054e+00 -1.95396324e+00 + -9.39188410e-01 7.96498057e-01 + 2.91986664e+00 -1.29392724e+00 + -1.54265750e+00 6.40727933e-01 + 1.14919794e+00 1.20834257e-01 + 2.00936817e+00 -1.53728359e+00 + 3.72468420e+00 -1.38704612e+00 + -1.27794802e+00 3.48543179e-01 + 3.63294077e-01 5.70623314e-01 + 1.49381016e+00 -6.04500534e-01 + 2.98912256e+00 -1.72295726e+00 + -1.80833817e+00 2.94907625e-01 + -3.19669622e+00 1.31888700e+00 + 1.45889401e+00 -8.88448639e-01 + -2.80045388e+00 1.01207060e+00 + -4.78379567e+00 1.48646520e+00 + 2.25510003e+00 -7.13372461e-01 + -9.74441433e-02 -2.17766373e-01 + 2.64468496e-01 -3.60842698e-01 + -5.98821713e+00 3.20197892e+00 + 2.67030213e-01 -5.36386416e-01 + 2.24546960e+00 -8.13464649e-01 + -4.89171414e-01 3.86255031e-01 + -7.45713706e-01 6.29800380e-01 + -3.30460503e-01 3.85127284e-01 + -4.19588147e+00 1.52793198e+00 + 5.42078582e-01 -2.61642741e-02 + 4.24938513e-01 -5.72936751e-01 + 2.82717288e+00 -6.75355024e-01 + -1.44741788e+00 5.03578028e-01 + -1.65547573e+00 7.76444277e-01 + 2.20361170e+00 -1.40835680e+00 + -3.69540235e+00 2.32953767e+00 + -1.41909357e-01 2.28989778e-01 + 1.92838879e+00 -8.72525737e-01 + 1.40708100e+00 -6.81849638e-02 + 1.24988112e+00 -1.39470590e-01 + -2.39435855e+00 7.26587655e-01 + 7.03985028e-01 4.85403277e-02 + 4.05214529e+00 -9.16928318e-01 + 3.74198837e-01 -5.04192358e-01 + -8.43374127e-01 2.36064018e-01 + -3.32253349e-01 7.47840055e-01 + -6.03725210e+00 1.95173337e+00 + 4.60829865e+00 -1.51191309e+00 + -1.46247098e+00 1.11140916e+00 + -9.60111157e-01 -1.23189114e-01 + -7.49613187e-01 4.53614129e-01 + -5.77838219e-01 2.07366469e-02 + 8.07652950e-01 -5.16272662e-01 + -6.02556049e-01 5.05318649e-01 + -1.28712445e-01 2.57836512e-01 + -5.27662820e+00 2.11790737e+00 + 5.40819308e+00 -2.15366022e+00 + 9.37742513e-02 -1.60221751e-01 + 4.55902865e+00 -1.24646307e+00 + -9.06582589e-01 1.92928110e-01 + 2.99928996e+00 
-8.04301218e-01 + -3.24317381e+00 1.80076061e+00 + 3.20421743e-01 8.76524679e-01 + -5.29606705e-01 -3.16717696e-01 + -1.77264560e+00 7.52686776e-01 + -1.51706824e+00 8.43755103e-01 + 1.52759111e+00 -7.86814243e-01 + 4.74845617e-01 4.21319700e-01 + 6.97829149e-01 -8.15664881e-01 + 3.09564973e+00 -1.06202469e+00 + 2.95320379e+00 -1.98963943e+00 + -4.23033224e+00 1.41013338e+00 + 1.48576206e+00 8.02908511e-02 + 4.52041627e+00 -2.04620399e+00 + 6.58403922e-01 -7.60781799e-01 + 2.10667543e-01 1.15241731e-01 + 1.77702583e+00 -8.10271859e-01 + 2.41277385e+00 -1.46972042e+00 + 1.50685525e+00 -1.99272545e-01 + 7.61665522e-01 -4.11276152e-01 + 1.18352312e+00 -9.59908608e-01 + -3.32031305e-01 8.07500132e-02 + 1.16813118e+00 -1.73095194e-01 + 1.18363346e+00 -5.41565052e-01 + 5.17702179e-01 -7.62442035e-01 + 4.57401006e-01 -1.45951115e-02 + 1.49377115e-01 2.99571605e-01 + 1.40399453e+00 -1.30160353e+00 + 5.26231567e-01 3.52783752e-01 + -1.91136514e+00 4.24228635e-01 + 1.74156701e+00 -9.92076776e-01 + -4.89323391e+00 2.32483507e+00 + 2.54011209e+00 -8.80366295e-01 + -5.56925706e-01 1.48842026e-01 + -2.35904668e+00 9.60474853e-01 + 1.42216971e+00 -4.67062761e-01 + -1.10809680e+00 7.68684300e-01 + 4.09674726e+00 -1.90795680e+00 + -2.23048923e+00 9.03812542e-01 + 6.57025763e-01 1.36514871e-01 + 2.10944145e+00 -9.78897838e-02 + 1.22552525e+00 -2.50303867e-01 + 2.84620103e-01 -5.30164020e-01 + -2.13562585e+00 1.03503056e+00 + 1.32414902e-01 -8.14190240e-03 + -5.82433561e-01 3.21020292e-01 + -5.06473247e-01 3.11530419e-01 + 1.57162465e+00 -1.20763919e+00 + -1.43155284e+00 -2.51203698e-02 + -1.47093713e+00 -1.39620999e-01 + -2.65765643e+00 1.06091403e+00 + 2.45992927e+00 -5.88815836e-01 + -1.28440162e+00 -1.99377398e-01 + 6.11257504e-01 -3.73577401e-01 + -3.46606103e-01 6.06081290e-01 + 3.76687505e+00 -8.80181424e-01 + -1.03725103e+00 1.45177517e+00 + 2.76659936e+00 -1.09361320e+00 + -3.61311296e+00 9.75032455e-01 + 3.22878655e+00 -9.69497365e-01 + 1.43560379e+00 -5.52524585e-01 + 2.94042153e+00 -1.79747037e+00 + 1.30739580e+00 2.47989248e-01 + -4.05056982e-01 1.22831715e+00 + -2.25827421e+00 2.30604626e-01 + 3.69262926e-01 4.32714650e-02 + -5.52064063e-01 6.07806340e-01 + 7.03325987e+00 -2.17956730e+00 + -2.37823835e-01 -8.28068639e-01 + -4.84279888e-01 5.67765194e-01 + -3.15863410e+00 1.02241617e+00 + -3.39561593e+00 1.36876374e+00 + -2.78482934e+00 6.81641104e-01 + -4.37604334e+00 2.23826340e+00 + -2.54049692e+00 8.22676745e-01 + 3.73264822e+00 -9.93498732e-01 + -3.49536064e+00 1.84771519e+00 + 9.81801604e-01 -5.21278776e-01 + 1.52996831e+00 -1.27386206e+00 + -9.23490293e-01 5.29099482e-01 + -2.76999461e+00 9.24831872e-01 + -3.30029834e-01 -2.49645555e-01 + -1.71156166e+00 5.44940854e-01 + -2.37009487e+00 5.83826982e-01 + -3.03216865e+00 1.04922722e+00 + -2.19539936e+00 1.37558730e+00 + 1.15350207e+00 -6.15318535e-01 + 4.62011792e+00 -2.46714517e+00 + 1.52627952e-02 -1.00618283e-01 + -1.10399342e+00 4.87413533e-01 + 3.55448194e+00 -9.10394190e-01 + -5.21890321e+00 2.44710745e+00 + 1.54289749e+00 -6.54269311e-01 + 2.67935674e+00 -9.92758863e-01 + 1.05801310e+00 2.60054285e-02 + 1.52509097e+00 -4.08768600e-01 + 3.27576917e+00 -1.28769406e+00 + 1.71008412e-01 -2.68739994e-01 + -9.83351344e-04 7.02495897e-02 + -7.60795056e-03 1.61968285e-01 + -1.80620472e+00 4.24934471e-01 + 2.32023297e-02 -2.57284559e-01 + 3.98219478e-01 -4.65361935e-01 + 6.63476988e-01 -3.29823196e-02 + 4.00154707e+00 -1.01792211e+00 + -1.50286870e+00 9.46875359e-01 + -2.22717585e+00 7.50636195e-01 + -3.47381508e-01 -6.51596975e-01 + 
2.08076453e+00 -8.22800165e-01 + 2.05099963e+00 -4.00868250e-01 + 3.52576988e-02 -2.54418565e-01 + 1.57342042e+00 -7.62166492e-02 + -1.47019722e+00 3.40861172e-01 + -1.21156090e+00 3.21891246e-01 + 3.79729047e+00 -1.54350764e+00 + 1.26459678e-02 6.99203693e-01 + 1.53974177e-01 4.68643204e-01 + -1.73923561e-01 -1.26229768e-01 + 4.54644993e+00 -2.13951783e+00 + 1.46022547e-01 -4.57084165e-01 + 6.50048037e+00 -2.78872609e+00 + -1.51934912e+00 1.03216768e+00 + -3.06483575e+00 1.81101446e+00 + -2.38212125e+00 9.19559042e-01 + -1.81319611e+00 8.10545112e-01 + 1.70951294e+00 -6.10712680e-01 + 1.67974156e+00 -1.51241453e+00 + -5.94795113e+00 2.56893813e+00 + 3.62633110e-01 -7.46965304e-01 + -2.44042594e+00 8.52761797e-01 + 3.32412550e+00 -1.28439899e+00 + 4.74860766e+00 -1.72821964e+00 + 1.29072541e+00 -8.24872902e-01 + -1.69450702e+00 4.09600876e-01 + 1.29705411e+00 1.22300809e-01 + -2.63597613e+00 8.55612913e-01 + 9.28467301e-01 -2.63550114e-02 + 2.44670264e+00 -4.10123002e-01 + 1.06408206e+00 -5.03361942e-01 + 5.12384049e-02 -1.27116595e-02 + -1.06731272e+00 -1.76205029e-01 + -9.45454582e-01 3.74404917e-01 + 2.54343689e+00 -7.13810545e-01 + -2.54460335e+00 1.31590265e+00 + 1.89864233e+00 -3.98436339e-01 + -1.93990133e+00 6.01474630e-01 + -1.35938824e+00 4.00751788e-01 + 2.38567018e+00 -6.13904880e-01 + 2.18748050e-01 2.62631712e-01 + -2.01388788e+00 1.41474031e+00 + 2.74014581e+00 -1.27448105e+00 + -2.13828583e+00 1.13616144e+00 + 5.98730932e+00 -2.53430080e+00 + -1.72872795e+00 1.53702057e+00 + -2.53263962e+00 1.27342410e+00 + 1.34326968e+00 -1.99395088e-01 + 3.83352666e-01 -1.25683065e-01 + -2.35630657e+00 5.54116983e-01 + -1.94900838e+00 5.76270178e-01 + -1.36699108e+00 -3.40904824e-01 + -2.34727346e+00 -1.93054940e-02 + -3.82779777e+00 1.83025664e+00 + -4.31602080e+00 9.21605705e-01 + 5.54098133e-01 2.33991419e-01 + -4.53591188e+00 1.99833353e+00 + -3.92715909e+00 1.83231482e+00 + 3.91344440e-01 -1.11355111e-01 + 3.48576363e+00 -1.41379449e+00 + -1.42858690e+00 3.84532286e-01 + 1.79519859e+00 -9.23486448e-01 + 8.49691242e-01 -1.76551084e-01 + 1.53618138e+00 8.23835015e-02 + 5.91476520e-02 3.88296940e-02 + 1.44837346e+00 -7.24097604e-01 + -6.79008418e-01 4.04078097e-01 + 2.87555510e+00 -9.51825076e-01 + -1.12379101e+00 2.93457714e-01 + 1.45263980e+00 -6.01960544e-01 + -2.55741621e-01 9.26233518e-01 + 3.54570714e+00 -1.41521877e+00 + -1.61542388e+00 6.57844512e-01 + -3.22844269e-01 3.02823546e-01 + 1.03523913e+00 -6.92730711e-01 + 1.11084909e+00 -3.50823642e-01 + 3.41268693e+00 -1.90865862e+00 + 7.67062858e-01 -9.48792160e-01 + -5.49798016e+00 1.71139960e+00 + 1.14865798e+00 -6.12669150e-01 + -2.18256680e+00 7.78634462e-01 + 4.78857389e+00 -2.55555085e+00 + -1.85555569e+00 8.04311615e-01 + -4.22278799e+00 2.01162524e+00 + -1.56556149e+00 1.54353907e+00 + -3.11527864e+00 1.65973526e+00 + 2.66342611e+00 -1.20449402e+00 + 1.57635314e+00 -1.48716308e-01 + -6.35606865e-01 2.59701180e-01 + 1.02431976e+00 -6.76929904e-01 + 1.12973772e+00 1.49473892e-02 + -9.12758116e-01 2.21533933e-01 + -2.98014470e+00 1.71651189e+00 + 2.74016965e+00 -9.47893923e-01 + -3.47830591e+00 1.34941430e+00 + 1.74757562e+00 -3.72503752e-01 + 5.55820383e-01 -6.47992466e-01 + -1.19871928e+00 9.82429151e-01 + -2.53040133e+00 2.10671307e+00 + -1.94085605e+00 1.38938137e+00 diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 413b824e369da..1e9ef345b7435 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -16,9 +16,12 @@ often used for exploratory analysis and/or as a component of a 
hierarchical supervised learning pipeline (in which distinct classifiers or regression models are trained for each cluster). -MLlib supports -[k-means](http://en.wikipedia.org/wiki/K-means_clustering) clustering, one of -the most commonly used clustering algorithms that clusters the data points into +MLlib supports the following models: + +### k-means + +[k-means](http://en.wikipedia.org/wiki/K-means_clustering) is one of the +most commonly used clustering algorithms that clusters the data points into a predefined number of clusters. The MLlib implementation includes a parallelized variant of the [k-means++](http://en.wikipedia.org/wiki/K-means%2B%2B) method called [kmeans||](http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf). @@ -34,6 +37,20 @@ a given dataset, the algorithm returns the best clustering result). * *initializationSteps* determines the number of steps in the k-means\|\| algorithm. * *epsilon* determines the distance threshold within which we consider k-means to have converged. +### Gaussian mixture + +A [Gaussian Mixture Model](http://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) +represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions, +each with its own probability. The MLlib implementation uses the +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) + algorithm to induce the maximum-likelihood model given a set of samples. The implementation +has the following parameters: + +* *k* is the number of desired clusters. +* *convergenceTol* is the maximum change in log-likelihood at which we consider convergence achieved. +* *maxIterations* is the maximum number of iterations to perform without reaching convergence. +* *initialModel* is an optional starting point from which to start the EM algorithm. If this parameter is omitted, a random starting point will be constructed from the data. + ### Power Iteration Clustering Power iteration clustering is a scalable and efficient algorithm for clustering points given pointwise mutual affinity values. Internally the algorithm: @@ -56,6 +73,8 @@ Example outputs for a dataset inspired by the paper - but with five clusters ins ### Examples +#### k-means +
 The following code snippets can be executed in `spark-shell`.
@@ -168,6 +187,112 @@ print("Within Set Sum of Squared Error = " + str(WSSSE))
+#### GaussianMixture
+
+In the following example after loading and parsing data, we use a +[GaussianMixture](api/scala/index.html#org.apache.spark.mllib.clustering.GaussianMixture) +object to cluster the data into two clusters. The number of desired clusters is passed +to the algorithm. We then output the parameters of the mixture model. + +{% highlight scala %} +import org.apache.spark.mllib.clustering.GaussianMixture +import org.apache.spark.mllib.linalg.Vectors + +// Load and parse the data +val data = sc.textFile("data/mllib/gmm_data.txt") +val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache() + +// Cluster the data into two classes using GaussianMixture +val gmm = new GaussianMixture().setK(2).run(parsedData) + +// output parameters of max-likelihood model +for (i <- 0 until gmm.k) { + println("weight=%f\nmu=%s\nsigma=\n%s\n" format + (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma)) +} + +{% endhighlight %} +
+ +
+All of MLlib's methods use Java-friendly types, so you can import and call them there the same
+way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the
+Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by
+calling `.rdd()` on your `JavaRDD` object. A self-contained application example
+that is equivalent to the provided example in Scala is given below:
+
+{% highlight java %}
+import org.apache.spark.api.java.*;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.clustering.GaussianMixture;
+import org.apache.spark.mllib.clustering.GaussianMixtureModel;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.SparkConf;
+
+public class GaussianMixtureExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("GaussianMixture Example");
+    JavaSparkContext sc = new JavaSparkContext(conf);
+
+    // Load and parse data
+    String path = "data/mllib/gmm_data.txt";
+    JavaRDD<String> data = sc.textFile(path);
+    JavaRDD<Vector> parsedData = data.map(
+      new Function<String, Vector>() {
+        public Vector call(String s) {
+          String[] sarray = s.trim().split(" ");
+          double[] values = new double[sarray.length];
+          for (int i = 0; i < sarray.length; i++)
+            values[i] = Double.parseDouble(sarray[i]);
+          return Vectors.dense(values);
+        }
+      }
+    );
+    parsedData.cache();
+
+    // Cluster the data into two classes using GaussianMixture
+    GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd());
+
+    // Output the parameters of the mixture model
+    for(int j=0; j<gmm.k(); j++) {
+      System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
+        gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma());
+    }
+  }
+}
+{% endhighlight %}
+
+In the following example after loading and parsing data, we use a +[GaussianMixture](api/python/pyspark.mllib.html#pyspark.mllib.clustering.GaussianMixture) +object to cluster the data into two clusters. The number of desired clusters is passed +to the algorithm. We then output the parameters of the mixture model. + +{% highlight python %} +from pyspark.mllib.clustering import GaussianMixture +from numpy import array + +# Load and parse the data +data = sc.textFile("data/mllib/gmm_data.txt") +parsedData = data.map(lambda line: array([float(x) for x in line.strip().split(' ')])) + +# Build the model (cluster the data) +gmm = GaussianMixture.train(parsedData, 2) + +# output parameters of model +for i in range(2): + print ("weight = ", gmm.weights[i], "mu = ", gmm.gaussians[i].mu, + "sigma = ", gmm.gaussians[i].sigma.toArray()) + +{% endhighlight %} +
+ +
+ In order to run the above application, follow the instructions provided in the [Self-Contained Applications](quick-start.html#self-contained-applications) section of the Spark diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 73728bb35eb96..7779fbc9c49e4 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -25,6 +25,8 @@ filtering, dimensionality reduction, as well as underlying optimization primitiv * alternating least squares (ALS) * [Clustering](mllib-clustering.html) * k-means + * Gaussian mixture + * power iteration * [Dimensionality reduction](mllib-dimensionality-reduction.html) * singular value decomposition (SVD) * principal component analysis (PCA) From cc6e53119d7a51b95b19244f50b25814088b4d11 Mon Sep 17 00:00:00 2001 From: lianhuiwang Date: Fri, 6 Feb 2015 10:48:31 -0800 Subject: [PATCH 010/357] [SPARK-5653][YARN] In ApplicationMaster rename isDriver to isClusterMode in ApplicationMaster rename isDriver to isClusterMode,because in Client it uses isClusterMode,ApplicationMaster should keep consistent with it and uses isClusterMode.Also isClusterMode is easier to understand. andrewor14 sryza Author: lianhuiwang Closes #4430 from lianhuiwang/am-isDriver-rename and squashes the following commits: f9f3ed0 [lianhuiwang] rename isDriver to isClusterMode --- .../spark/deploy/yarn/ApplicationMaster.scala | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 37e98e01fddf7..4cc320c5d59b5 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -54,7 +54,7 @@ private[spark] class ApplicationMaster( private val sparkConf = new SparkConf() private val yarnConf: YarnConfiguration = SparkHadoopUtil.get.newConfiguration(sparkConf) .asInstanceOf[YarnConfiguration] - private val isDriver = args.userClass != null + private val isClusterMode = args.userClass != null // Default to numExecutors * 2, with minimum of 3 private val maxNumExecutorFailures = sparkConf.getInt("spark.yarn.max.executor.failures", @@ -81,7 +81,7 @@ private[spark] class ApplicationMaster( try { val appAttemptId = client.getAttemptId() - if (isDriver) { + if (isClusterMode) { // Set the web ui port to be ephemeral for yarn so we don't conflict with // other spark processes running on the same box System.setProperty("spark.ui.port", "0") @@ -139,7 +139,7 @@ private[spark] class ApplicationMaster( // doAs in order for the credentials to be passed on to the executor containers. val securityMgr = new SecurityManager(sparkConf) - if (isDriver) { + if (isClusterMode) { runDriver(securityMgr) } else { runExecutorLauncher(securityMgr) @@ -162,7 +162,7 @@ private[spark] class ApplicationMaster( * from the application code. 
*/ final def getDefaultFinalStatus() = { - if (isDriver) { + if (isClusterMode) { FinalApplicationStatus.SUCCEEDED } else { FinalApplicationStatus.UNDEFINED @@ -243,7 +243,7 @@ private[spark] class ApplicationMaster( private def runAMActor( host: String, port: String, - isDriver: Boolean): Unit = { + isClusterMode: Boolean): Unit = { val driverUrl = AkkaUtils.address( AkkaUtils.protocol(actorSystem), @@ -251,7 +251,7 @@ private[spark] class ApplicationMaster( host, port, YarnSchedulerBackend.ACTOR_NAME) - actor = actorSystem.actorOf(Props(new AMActor(driverUrl, isDriver)), name = "YarnAM") + actor = actorSystem.actorOf(Props(new AMActor(driverUrl, isClusterMode)), name = "YarnAM") } private def runDriver(securityMgr: SecurityManager): Unit = { @@ -272,7 +272,7 @@ private[spark] class ApplicationMaster( runAMActor( sc.getConf.get("spark.driver.host"), sc.getConf.get("spark.driver.port"), - isDriver = true) + isClusterMode = true) registerAM(sc.ui.map(_.appUIAddress).getOrElse(""), securityMgr) userClassThread.join() } @@ -427,7 +427,7 @@ private[spark] class ApplicationMaster( sparkConf.set("spark.driver.host", driverHost) sparkConf.set("spark.driver.port", driverPort.toString) - runAMActor(driverHost, driverPort.toString, isDriver = false) + runAMActor(driverHost, driverPort.toString, isClusterMode = false) } /** Add the Yarn IP filter that is required for properly securing the UI. */ @@ -435,7 +435,7 @@ private[spark] class ApplicationMaster( val proxyBase = System.getenv(ApplicationConstants.APPLICATION_WEB_PROXY_BASE_ENV) val amFilter = "org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter" val params = client.getAmIpFilterParams(yarnConf, proxyBase) - if (isDriver) { + if (isClusterMode) { System.setProperty("spark.ui.filters", amFilter) params.foreach { case (k, v) => System.setProperty(s"spark.$amFilter.param.$k", v) } } else { @@ -491,7 +491,7 @@ private[spark] class ApplicationMaster( /** * An actor that communicates with the driver's scheduler backend. */ - private class AMActor(driverUrl: String, isDriver: Boolean) extends Actor { + private class AMActor(driverUrl: String, isClusterMode: Boolean) extends Actor { var driver: ActorSelection = _ override def preStart() = { @@ -503,7 +503,7 @@ private[spark] class ApplicationMaster( driver ! RegisterClusterManager // In cluster mode, the AM can directly monitor the driver status instead // of trying to deduce it from the lifecycle of the driver's actor - if (!isDriver) { + if (!isClusterMode) { context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent]) } } @@ -513,7 +513,7 @@ private[spark] class ApplicationMaster( logInfo(s"Driver terminated or disconnected! Shutting down. $x") // In cluster mode, do not rely on the disassociated event to exit // This avoids potentially reporting incorrect exit codes if the driver fails - if (!isDriver) { + if (!isClusterMode) { finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS) } From 1a88f20de798030a7d5713bd267f612ba5617fca Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Fri, 6 Feb 2015 10:53:16 -0800 Subject: [PATCH 011/357] SPARK-4337. [YARN] Add ability to cancel pending requests Author: Sandy Ryza Closes #4141 from sryza/sandy-spark-4337 and squashes the following commits: a98bd20 [Sandy Ryza] Andrew's comments cdaab7f [Sandy Ryza] SPARK-4337. 
Add ability to cancel pending requests to YARN --- .../spark/deploy/yarn/YarnAllocator.scala | 65 ++++++++++++------- .../deploy/yarn/YarnAllocatorSuite.scala | 54 +++++++++++++-- 2 files changed, 89 insertions(+), 30 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 0dbb6154b3039..12c62a659d799 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -69,8 +69,7 @@ private[yarn] class YarnAllocator( } // Visible for testing. - val allocatedHostToContainersMap = - new HashMap[String, collection.mutable.Set[ContainerId]] + val allocatedHostToContainersMap = new HashMap[String, collection.mutable.Set[ContainerId]] val allocatedContainerToHostMap = new HashMap[ContainerId, String] // Containers that we no longer care about. We've either already told the RM to release them or @@ -84,7 +83,7 @@ private[yarn] class YarnAllocator( private var executorIdCounter = 0 @volatile private var numExecutorsFailed = 0 - @volatile private var maxExecutors = args.numExecutors + @volatile private var targetNumExecutors = args.numExecutors // Keep track of which container is running which executor to remove the executors later private val executorIdToContainer = new HashMap[String, Container] @@ -133,10 +132,12 @@ private[yarn] class YarnAllocator( amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, resource).map(_.size).sum /** - * Request as many executors from the ResourceManager as needed to reach the desired total. + * Request as many executors from the ResourceManager as needed to reach the desired total. If + * the requested total is smaller than the current number of running executors, no executors will + * be killed. */ def requestTotalExecutors(requestedTotal: Int): Unit = synchronized { - maxExecutors = requestedTotal + targetNumExecutors = requestedTotal } /** @@ -147,8 +148,8 @@ private[yarn] class YarnAllocator( val container = executorIdToContainer.remove(executorId).get internalReleaseContainer(container) numExecutorsRunning -= 1 - maxExecutors -= 1 - assert(maxExecutors >= 0, "Allocator killed more executors than are allocated!") + targetNumExecutors -= 1 + assert(targetNumExecutors >= 0, "Allocator killed more executors than are allocated!") } else { logWarning(s"Attempted to kill unknown executor $executorId!") } @@ -163,15 +164,8 @@ private[yarn] class YarnAllocator( * This must be synchronized because variables read in this method are mutated by other methods. */ def allocateResources(): Unit = synchronized { - val numPendingAllocate = getNumPendingAllocate - val missing = maxExecutors - numPendingAllocate - numExecutorsRunning + updateResourceRequests() - if (missing > 0) { - logInfo(s"Will request $missing executor containers, each with ${resource.getVirtualCores} " + - s"cores and ${resource.getMemory} MB memory including $memoryOverhead MB overhead") - } - - addResourceRequests(missing) val progressIndicator = 0.1f // Poll the ResourceManager. This doubles as a heartbeat if there are no pending container // requests. @@ -201,15 +195,36 @@ private[yarn] class YarnAllocator( } /** - * Request numExecutors additional containers from YARN. Visible for testing. + * Update the set of container requests that we will sync with the RM based on the number of + * executors we have currently running and our target number of executors. + * + * Visible for testing. 
*/ - def addResourceRequests(numExecutors: Int): Unit = { - for (i <- 0 until numExecutors) { - val request = new ContainerRequest(resource, null, null, RM_REQUEST_PRIORITY) - amClient.addContainerRequest(request) - val nodes = request.getNodes - val hostStr = if (nodes == null || nodes.isEmpty) "Any" else nodes.last - logInfo("Container request (host: %s, capability: %s".format(hostStr, resource)) + def updateResourceRequests(): Unit = { + val numPendingAllocate = getNumPendingAllocate + val missing = targetNumExecutors - numPendingAllocate - numExecutorsRunning + + if (missing > 0) { + logInfo(s"Will request $missing executor containers, each with ${resource.getVirtualCores} " + + s"cores and ${resource.getMemory} MB memory including $memoryOverhead MB overhead") + + for (i <- 0 until missing) { + val request = new ContainerRequest(resource, null, null, RM_REQUEST_PRIORITY) + amClient.addContainerRequest(request) + val nodes = request.getNodes + val hostStr = if (nodes == null || nodes.isEmpty) "Any" else nodes.last + logInfo(s"Container request (host: $hostStr, capability: $resource)") + } + } else if (missing < 0) { + val numToCancel = math.min(numPendingAllocate, -missing) + logInfo(s"Canceling requests for $numToCancel executor containers") + + val matchingRequests = amClient.getMatchingRequests(RM_REQUEST_PRIORITY, ANY_HOST, resource) + if (!matchingRequests.isEmpty) { + matchingRequests.head.take(numToCancel).foreach(amClient.removeContainerRequest) + } else { + logWarning("Expected to find pending requests, but found none.") + } } } @@ -266,7 +281,7 @@ private[yarn] class YarnAllocator( * containersToUse or remaining. * * @param allocatedContainer container that was given to us by YARN - * @location resource name, either a node, rack, or * + * @param location resource name, either a node, rack, or * * @param containersToUse list of containers that will be used * @param remaining list of containers that will not be used */ @@ -294,7 +309,7 @@ private[yarn] class YarnAllocator( private def runAllocatedContainers(containersToUse: ArrayBuffer[Container]): Unit = { for (container <- containersToUse) { numExecutorsRunning += 1 - assert(numExecutorsRunning <= maxExecutors) + assert(numExecutorsRunning <= targetNumExecutors) val executorHostname = container.getNodeId.getHost val containerId = container.getId executorIdCounter += 1 diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala index 024b25f9d3365..3c224f148802e 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala @@ -107,8 +107,8 @@ class YarnAllocatorSuite extends FunSuite with Matchers with BeforeAndAfterEach test("single container allocated") { // request a single container and receive it - val handler = createAllocator() - handler.addResourceRequests(1) + val handler = createAllocator(1) + handler.updateResourceRequests() handler.getNumExecutorsRunning should be (0) handler.getNumPendingAllocate should be (1) @@ -123,8 +123,8 @@ class YarnAllocatorSuite extends FunSuite with Matchers with BeforeAndAfterEach test("some containers allocated") { // request a few containers and receive some of them - val handler = createAllocator() - handler.addResourceRequests(4) + val handler = createAllocator(4) + handler.updateResourceRequests() handler.getNumExecutorsRunning should be (0) handler.getNumPendingAllocate should 
be (4) @@ -144,7 +144,7 @@ class YarnAllocatorSuite extends FunSuite with Matchers with BeforeAndAfterEach test("receive more containers than requested") { val handler = createAllocator(2) - handler.addResourceRequests(2) + handler.updateResourceRequests() handler.getNumExecutorsRunning should be (0) handler.getNumPendingAllocate should be (2) @@ -162,6 +162,50 @@ class YarnAllocatorSuite extends FunSuite with Matchers with BeforeAndAfterEach handler.allocatedHostToContainersMap.contains("host4") should be (false) } + test("decrease total requested executors") { + val handler = createAllocator(4) + handler.updateResourceRequests() + handler.getNumExecutorsRunning should be (0) + handler.getNumPendingAllocate should be (4) + + handler.requestTotalExecutors(3) + handler.updateResourceRequests() + handler.getNumPendingAllocate should be (3) + + val container = createContainer("host1") + handler.handleAllocatedContainers(Array(container)) + + handler.getNumExecutorsRunning should be (1) + handler.allocatedContainerToHostMap.get(container.getId).get should be ("host1") + handler.allocatedHostToContainersMap.get("host1").get should contain (container.getId) + + handler.requestTotalExecutors(2) + handler.updateResourceRequests() + handler.getNumPendingAllocate should be (1) + } + + test("decrease total requested executors to less than currently running") { + val handler = createAllocator(4) + handler.updateResourceRequests() + handler.getNumExecutorsRunning should be (0) + handler.getNumPendingAllocate should be (4) + + handler.requestTotalExecutors(3) + handler.updateResourceRequests() + handler.getNumPendingAllocate should be (3) + + val container1 = createContainer("host1") + val container2 = createContainer("host2") + handler.handleAllocatedContainers(Array(container1, container2)) + + handler.getNumExecutorsRunning should be (2) + + handler.requestTotalExecutors(1) + handler.updateResourceRequests() + handler.getNumPendingAllocate should be (0) + handler.getNumExecutorsRunning should be (2) + } + test("memory exceeded diagnostic regexes") { val diagnostics = "Container [pid=12465,containerID=container_1412887393566_0003_01_000002] is running " + From fe3740c4c859d087b714c666741a29061bba5f58 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Fri, 6 Feb 2015 10:54:23 -0800 Subject: [PATCH 012/357] [SPARK-5636] Ramp up faster in dynamic allocation A recent patch #4051 made the initial number default to 0. With this change, any Spark application using dynamic allocation's default settings will ramp up very slowly. Since we never request more executors than needed to saturate the pending tasks, it is safe to ramp up quickly. The current default of 60 may be too slow. 
Author: Andrew Or Closes #4409 from andrewor14/dynamic-allocation-interval and squashes the following commits: d3cc485 [Andrew Or] Lower request interval --- .../scala/org/apache/spark/ExecutorAllocationManager.scala | 6 +++--- docs/configuration.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 5d5288bb6e60d..8b38366e0325c 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -76,15 +76,15 @@ private[spark] class ExecutorAllocationManager( private val maxNumExecutors = conf.getInt("spark.dynamicAllocation.maxExecutors", Integer.MAX_VALUE) - // How long there must be backlogged tasks for before an addition is triggered + // How long there must be backlogged tasks for before an addition is triggered (seconds) private val schedulerBacklogTimeout = conf.getLong( - "spark.dynamicAllocation.schedulerBacklogTimeout", 60) + "spark.dynamicAllocation.schedulerBacklogTimeout", 5) // Same as above, but used only after `schedulerBacklogTimeout` is exceeded private val sustainedSchedulerBacklogTimeout = conf.getLong( "spark.dynamicAllocation.sustainedSchedulerBacklogTimeout", schedulerBacklogTimeout) - // How long an executor must be idle for before it is removed + // How long an executor must be idle for before it is removed (seconds) private val executorIdleTimeout = conf.getLong( "spark.dynamicAllocation.executorIdleTimeout", 600) diff --git a/docs/configuration.md b/docs/configuration.md index 4c86cb7c16238..00e973c245005 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1140,7 +1140,7 @@ Apart from these, the following properties are also available, and may be useful spark.dynamicAllocation.schedulerBacklogTimeout - 60 + 5 If dynamic allocation is enabled and there have been pending tasks backlogged for more than this duration (in seconds), new executors will be requested. For more detail, see this From c01b9852ea2f7d453249b07d89e62af71bd26e3d Mon Sep 17 00:00:00 2001 From: Masayoshi TSUZUKI Date: Fri, 6 Feb 2015 10:58:26 -0800 Subject: [PATCH 013/357] [SPARK-5396] Syntax error in spark scripts on windows. Modified syntax error in spark-submit2.cmd. Command prompt doesn't have "defined" operator. Author: Masayoshi TSUZUKI Closes #4428 from tsudukim/feature/SPARK-5396 and squashes the following commits: ec18465 [Masayoshi TSUZUKI] [SPARK-5396] Syntax error in spark scripts on windows. --- bin/spark-submit2.cmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/spark-submit2.cmd b/bin/spark-submit2.cmd index 12244a9cb04fb..446cbc74b74f9 100644 --- a/bin/spark-submit2.cmd +++ b/bin/spark-submit2.cmd @@ -25,7 +25,7 @@ set ORIG_ARGS=%* rem Reset the values of all variables used set SPARK_SUBMIT_DEPLOY_MODE=client -if not defined %SPARK_CONF_DIR% ( +if [%SPARK_CONF_DIR%] == [] ( set SPARK_CONF_DIR=%SPARK_HOME%\conf ) set SPARK_SUBMIT_PROPERTIES_FILE=%SPARK_CONF_DIR%\spark-defaults.conf From 9792bec596113a6f5f4534772b7539255403b082 Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Fri, 6 Feb 2015 11:03:56 -0800 Subject: [PATCH 014/357] [SPARK-4877] Allow user first classes to extend classes in the parent. Previously, the classloader isolation was almost too good, such that if a child class needed to load/reference a class that was only available in the parent, it could not do so. 
This adds tests for that case, the user-first Fake2 class extends the only-in-parent Fake3 class. It also sneaks in a fix where only the first stage seemed to work, and on subsequent stages, a LinkageError happened because classes from the user-first classpath were getting defined twice. Author: Stephen Haberman Closes #3725 from stephenh/4877_user_first_parent_inheritance and squashes the following commits: dabcd35 [Stephen Haberman] [SPARK-4877] Respect userClassPathFirst for the driver code too. 3d0fa7c [Stephen Haberman] [SPARK-4877] Allow user first classes to extend classes in the parent. --- .../scala/org/apache/spark/TestUtils.scala | 34 +++++++++++++++---- .../org/apache/spark/deploy/SparkSubmit.scala | 17 ++++++---- .../executor/ExecutorURLClassLoader.scala | 12 ++++++- .../ExecutorURLClassLoaderSuite.scala | 16 ++++++--- 4 files changed, 61 insertions(+), 18 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index 34078142f5385..be081c3825566 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -43,11 +43,20 @@ private[spark] object TestUtils { * Note: if this is used during class loader tests, class names should be unique * in order to avoid interference between tests. */ - def createJarWithClasses(classNames: Seq[String], value: String = ""): URL = { + def createJarWithClasses( + classNames: Seq[String], + toStringValue: String = "", + classNamesWithBase: Seq[(String, String)] = Seq(), + classpathUrls: Seq[URL] = Seq()): URL = { val tempDir = Utils.createTempDir() - val files = for (name <- classNames) yield createCompiledClass(name, tempDir, value) + val files1 = for (name <- classNames) yield { + createCompiledClass(name, tempDir, toStringValue, classpathUrls = classpathUrls) + } + val files2 = for ((childName, baseName) <- classNamesWithBase) yield { + createCompiledClass(childName, tempDir, toStringValue, baseName, classpathUrls) + } val jarFile = new File(tempDir, "testJar-%s.jar".format(System.currentTimeMillis())) - createJar(files, jarFile) + createJar(files1 ++ files2, jarFile) } @@ -85,15 +94,26 @@ private[spark] object TestUtils { } /** Creates a compiled class with the given name. Class file will be placed in destDir. */ - def createCompiledClass(className: String, destDir: File, value: String = ""): File = { + def createCompiledClass( + className: String, + destDir: File, + toStringValue: String = "", + baseClass: String = null, + classpathUrls: Seq[URL] = Seq()): File = { val compiler = ToolProvider.getSystemJavaCompiler + val extendsText = Option(baseClass).map { c => s" extends ${c}" }.getOrElse("") val sourceFile = new JavaSourceFromString(className, - "public class " + className + " implements java.io.Serializable {" + - " @Override public String toString() { return \"" + value + "\"; }}") + "public class " + className + extendsText + " implements java.io.Serializable {" + + " @Override public String toString() { return \"" + toStringValue + "\"; }}") // Calling this outputs a class file in pwd. It's easier to just rename the file than // build a custom FileManager that controls the output location. 
- compiler.getTask(null, null, null, null, null, Seq(sourceFile)).call() + val options = if (classpathUrls.nonEmpty) { + Seq("-classpath", classpathUrls.map { _.getFile }.mkString(File.pathSeparator)) + } else { + Seq() + } + compiler.getTask(null, null, null, options, null, Seq(sourceFile)).call() val fileName = className + ".class" val result = new File(fileName) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 8bbfcd2997dc6..9d25e647f1703 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -20,11 +20,9 @@ package org.apache.spark.deploy import java.io.{File, PrintStream} import java.lang.reflect.{Modifier, InvocationTargetException} import java.net.URL - import scala.collection.mutable.{ArrayBuffer, HashMap, Map} import org.apache.hadoop.fs.Path - import org.apache.ivy.Ivy import org.apache.ivy.core.LogOptions import org.apache.ivy.core.module.descriptor.{DefaultExcludeRule, DefaultDependencyDescriptor, DefaultModuleDescriptor} @@ -35,9 +33,10 @@ import org.apache.ivy.core.retrieve.RetrieveOptions import org.apache.ivy.core.settings.IvySettings import org.apache.ivy.plugins.matcher.GlobPatternMatcher import org.apache.ivy.plugins.resolver.{ChainResolver, IBiblioResolver} - import org.apache.spark.executor.ExecutorURLClassLoader import org.apache.spark.util.Utils +import org.apache.spark.executor.ChildExecutorURLClassLoader +import org.apache.spark.executor.MutableURLClassLoader /** * Main gateway of launching a Spark application. @@ -389,8 +388,14 @@ object SparkSubmit { printStream.println("\n") } - val loader = new ExecutorURLClassLoader(new Array[URL](0), - Thread.currentThread.getContextClassLoader) + val loader = + if (sysProps.getOrElse("spark.files.userClassPathFirst", "false").toBoolean) { + new ChildExecutorURLClassLoader(new Array[URL](0), + Thread.currentThread.getContextClassLoader) + } else { + new ExecutorURLClassLoader(new Array[URL](0), + Thread.currentThread.getContextClassLoader) + } Thread.currentThread.setContextClassLoader(loader) for (jar <- childClasspath) { @@ -434,7 +439,7 @@ object SparkSubmit { } } - private def addJarToClasspath(localJar: String, loader: ExecutorURLClassLoader) { + private def addJarToClasspath(localJar: String, loader: MutableURLClassLoader) { val uri = Utils.resolveURI(localJar) uri.getScheme match { case "file" | "local" => diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorURLClassLoader.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorURLClassLoader.scala index 218ed7b5d2d39..8011e75944aac 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorURLClassLoader.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorURLClassLoader.scala @@ -39,7 +39,17 @@ private[spark] class ChildExecutorURLClassLoader(urls: Array[URL], parent: Class super.addURL(url) } override def findClass(name: String): Class[_] = { - super.findClass(name) + val loaded = super.findLoadedClass(name) + if (loaded != null) { + return loaded + } + try { + super.findClass(name) + } catch { + case e: ClassNotFoundException => { + parentClassLoader.loadClass(name) + } + } } } diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorURLClassLoaderSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorURLClassLoaderSuite.scala index e2050e95a1b88..b7912c09d1410 100644 --- 
a/core/src/test/scala/org/apache/spark/executor/ExecutorURLClassLoaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorURLClassLoaderSuite.scala @@ -26,10 +26,14 @@ import org.apache.spark.util.Utils class ExecutorURLClassLoaderSuite extends FunSuite { - val childClassNames = List("FakeClass1", "FakeClass2") - val parentClassNames = List("FakeClass1", "FakeClass2", "FakeClass3") - val urls = List(TestUtils.createJarWithClasses(childClassNames, "1")).toArray - val urls2 = List(TestUtils.createJarWithClasses(parentClassNames, "2")).toArray + val urls2 = List(TestUtils.createJarWithClasses( + classNames = Seq("FakeClass1", "FakeClass2", "FakeClass3"), + toStringValue = "2")).toArray + val urls = List(TestUtils.createJarWithClasses( + classNames = Seq("FakeClass1"), + classNamesWithBase = Seq(("FakeClass2", "FakeClass3")), // FakeClass3 is in parent + toStringValue = "1", + classpathUrls = urls2)).toArray test("child first") { val parentLoader = new URLClassLoader(urls2, null) @@ -37,6 +41,8 @@ class ExecutorURLClassLoaderSuite extends FunSuite { val fakeClass = classLoader.loadClass("FakeClass2").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "1") + val fakeClass2 = classLoader.loadClass("FakeClass2").newInstance() + assert(fakeClass.getClass === fakeClass2.getClass) } test("parent first") { @@ -45,6 +51,8 @@ class ExecutorURLClassLoaderSuite extends FunSuite { val fakeClass = classLoader.loadClass("FakeClass1").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") + val fakeClass2 = classLoader.loadClass("FakeClass1").newInstance() + assert(fakeClass.getClass === fakeClass2.getClass) } test("child first can fall back") { From 6072fcc14ee1a4eba793e725fcb2cb2ffebd5b60 Mon Sep 17 00:00:00 2001 From: lianhuiwang Date: Fri, 6 Feb 2015 11:09:37 -0800 Subject: [PATCH 015/357] [SPARK-5593][Core]Replace BlockManagerListener with ExecutorListener in ExecutorAllocationListener More strictly, in ExecutorAllocationListener we need to replace onBlockManagerAdded and onBlockManagerRemoved with onExecutorAdded and onExecutorRemoved, because the executor-level events express the intended meaning more accurately. For example, in SPARK-5529 a BlockManager had been removed while its executor still existed.
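For illustration, a minimal sketch of what listening at the executor level looks like; the class and field names here are illustrative only and are not part of this patch:

import scala.collection.mutable

import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded, SparkListenerExecutorRemoved}

// Keeps a set of currently registered executor IDs, updated purely from
// executor lifecycle events rather than BlockManager events.
class LiveExecutorTracker extends SparkListener {
  private val liveExecutors = mutable.HashSet[String]()

  // Fired when an executor registers with the driver, independently of the
  // state of its BlockManager.
  override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = {
    liveExecutors += executorAdded.executorId
  }

  // Fired only when the executor itself goes away, which is the condition the
  // allocation logic actually cares about (a BlockManager can be removed
  // while its executor is still alive).
  override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = {
    liveExecutors -= executorRemoved.executorId
  }
}

Such a listener would be registered with sc.addSparkListener; the ExecutorAllocationManager change below applies the same event switch to its internal listener.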
andrewor14 sryza Author: lianhuiwang Closes #4369 from lianhuiwang/SPARK-5593 and squashes the following commits: 333367c [lianhuiwang] Replace BlockManagerListener with ExecutorListener in ExecutorAllocationListener --- .../spark/ExecutorAllocationManager.scala | 9 +++--- .../ExecutorAllocationManagerSuite.scala | 32 +++++++++---------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 8b38366e0325c..02d54bf3b53cc 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -486,8 +486,8 @@ private[spark] class ExecutorAllocationManager( } } - override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded): Unit = { - val executorId = blockManagerAdded.blockManagerId.executorId + override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = { + val executorId = executorAdded.executorId if (executorId != SparkContext.DRIVER_IDENTIFIER) { // This guards against the race condition in which the `SparkListenerTaskStart` // event is posted before the `SparkListenerBlockManagerAdded` event, which is @@ -498,9 +498,8 @@ private[spark] class ExecutorAllocationManager( } } - override def onBlockManagerRemoved( - blockManagerRemoved: SparkListenerBlockManagerRemoved): Unit = { - allocationManager.onExecutorRemoved(blockManagerRemoved.blockManagerId.executorId) + override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { + allocationManager.onExecutorRemoved(executorRemoved.executorId) } /** diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index 57081ddd959a5..c2869628afbb3 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -22,7 +22,7 @@ import scala.collection.mutable import org.scalatest.{FunSuite, PrivateMethodTester} import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ -import org.apache.spark.storage.BlockManagerId +import org.apache.spark.scheduler.cluster.ExecutorInfo /** * Test add and remove behavior of ExecutorAllocationManager. 
@@ -144,8 +144,8 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { // Verify that running a task reduces the cap sc.listenerBus.postToAll(SparkListenerStageSubmitted(createStageInfo(1, 3))) - sc.listenerBus.postToAll(SparkListenerBlockManagerAdded( - 0L, BlockManagerId("executor-1", "host1", 1), 100L)) + sc.listenerBus.postToAll(SparkListenerExecutorAdded( + 0L, "executor-1", new ExecutorInfo("host1", 1))) sc.listenerBus.postToAll(SparkListenerTaskStart(1, 0, createTaskInfo(0, 0, "executor-1"))) assert(numExecutorsPending(manager) === 4) assert(addExecutors(manager) === 1) @@ -578,30 +578,28 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { assert(removeTimes(manager).isEmpty) // New executors have registered - sc.listenerBus.postToAll(SparkListenerBlockManagerAdded( - 0L, BlockManagerId("executor-1", "host1", 1), 100L)) + sc.listenerBus.postToAll(SparkListenerExecutorAdded( + 0L, "executor-1", new ExecutorInfo("host1", 1))) assert(executorIds(manager).size === 1) assert(executorIds(manager).contains("executor-1")) assert(removeTimes(manager).size === 1) assert(removeTimes(manager).contains("executor-1")) - sc.listenerBus.postToAll(SparkListenerBlockManagerAdded( - 0L, BlockManagerId("executor-2", "host2", 1), 100L)) + sc.listenerBus.postToAll(SparkListenerExecutorAdded( + 0L, "executor-2", new ExecutorInfo("host2", 1))) assert(executorIds(manager).size === 2) assert(executorIds(manager).contains("executor-2")) assert(removeTimes(manager).size === 2) assert(removeTimes(manager).contains("executor-2")) // Existing executors have disconnected - sc.listenerBus.postToAll(SparkListenerBlockManagerRemoved( - 0L, BlockManagerId("executor-1", "host1", 1))) + sc.listenerBus.postToAll(SparkListenerExecutorRemoved(0L, "executor-1", "")) assert(executorIds(manager).size === 1) assert(!executorIds(manager).contains("executor-1")) assert(removeTimes(manager).size === 1) assert(!removeTimes(manager).contains("executor-1")) // Unknown executor has disconnected - sc.listenerBus.postToAll(SparkListenerBlockManagerRemoved( - 0L, BlockManagerId("executor-3", "host3", 1))) + sc.listenerBus.postToAll(SparkListenerExecutorRemoved(0L, "executor-3", "")) assert(executorIds(manager).size === 1) assert(removeTimes(manager).size === 1) } @@ -613,8 +611,8 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { assert(removeTimes(manager).isEmpty) sc.listenerBus.postToAll(SparkListenerTaskStart(0, 0, createTaskInfo(0, 0, "executor-1"))) - sc.listenerBus.postToAll(SparkListenerBlockManagerAdded( - 0L, BlockManagerId("executor-1", "host1", 1), 100L)) + sc.listenerBus.postToAll(SparkListenerExecutorAdded( + 0L, "executor-1", new ExecutorInfo("host1", 1))) assert(executorIds(manager).size === 1) assert(executorIds(manager).contains("executor-1")) assert(removeTimes(manager).size === 0) @@ -625,16 +623,16 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { val manager = sc.executorAllocationManager.get assert(executorIds(manager).isEmpty) assert(removeTimes(manager).isEmpty) - sc.listenerBus.postToAll(SparkListenerBlockManagerAdded( - 0L, BlockManagerId("executor-1", "host1", 1), 100L)) + sc.listenerBus.postToAll(SparkListenerExecutorAdded( + 0L, "executor-1", new ExecutorInfo("host1", 1))) sc.listenerBus.postToAll(SparkListenerTaskStart(0, 0, createTaskInfo(0, 0, "executor-1"))) assert(executorIds(manager).size === 1) assert(executorIds(manager).contains("executor-1")) assert(removeTimes(manager).size === 
0) - sc.listenerBus.postToAll(SparkListenerBlockManagerAdded( - 0L, BlockManagerId("executor-2", "host1", 1), 100L)) + sc.listenerBus.postToAll(SparkListenerExecutorAdded( + 0L, "executor-2", new ExecutorInfo("host1", 1))) assert(executorIds(manager).size === 2) assert(executorIds(manager).contains("executor-2")) assert(removeTimes(manager).size === 1) From 4cdb26c174e479a144950d12e1ad180f361af1fd Mon Sep 17 00:00:00 2001 From: Makoto Fukuhara Date: Fri, 6 Feb 2015 11:11:38 -0800 Subject: [PATCH 016/357] [SPARK-5618][Spark Core][Minor] Optimise utility code. Author: Makoto Fukuhara Closes #4396 from fukuo33/fix-unnecessary-regex and squashes the following commits: cd07fd6 [Makoto Fukuhara] fix unnecessary regex. --- core/src/main/scala/org/apache/spark/util/Utils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 72d15e65bcde6..61d287ca9c3ac 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1162,9 +1162,9 @@ private[spark] object Utils extends Logging { // finding the call site of a method. val SPARK_CORE_CLASS_REGEX = """^org\.apache\.spark(\.api\.java)?(\.util)?(\.rdd)?(\.broadcast)?\.[A-Z]""".r - val SCALA_CLASS_REGEX = """^scala""".r + val SCALA_CORE_CLASS_PREFIX = "scala" val isSparkCoreClass = SPARK_CORE_CLASS_REGEX.findFirstIn(className).isDefined - val isScalaClass = SCALA_CLASS_REGEX.findFirstIn(className).isDefined + val isScalaClass = className.startsWith(SCALA_CORE_CLASS_PREFIX) // If the class is a Spark internal class or a Scala class, then exclude. isSparkCoreClass || isScalaClass } From 32e964c410e7083b43264c46291e93cd206a8038 Mon Sep 17 00:00:00 2001 From: Kostas Sakellis Date: Fri, 6 Feb 2015 11:13:00 -0800 Subject: [PATCH 017/357] SPARK-2450 Adds executor log links to Web UI Adds links to stderr/stdout in the executor tab of the webUI for: 1) Standalone 2) Yarn client 3) Yarn cluster This tries to add the log url support in a general way so as to make it easy to add support for all the cluster managers. This is done by using environment variables to pass to the executor the log urls. The SPARK_LOG_URL_ prefix is used and so additional logs besides stderr/stdout can also be added. To propagate this information to the UI we use the onExecutorAdded spark listener event. Although this commit doesn't add log urls when running on a mesos cluster, it should be possible to add using the same mechanism. Author: Kostas Sakellis Author: Josh Rosen Closes #3486 from ksakellis/kostas-spark-2450 and squashes the following commits: d190936 [Josh Rosen] Fix a few minor style / formatting nits. Reset listener after each test Don't null listener out at end of main(). 8673fe1 [Kostas Sakellis] CR feedback. 
Hide the log column if there are no logs available 5bf6952 [Kostas Sakellis] [SPARK-2450] [CORE] Adds exeuctor log links to Web UI --- .../spark/deploy/worker/ExecutorRunner.scala | 7 +++ .../apache/spark/deploy/worker/Worker.scala | 1 + .../CoarseGrainedExecutorBackend.scala | 8 ++- .../cluster/CoarseGrainedClusterMessage.scala | 6 +- .../CoarseGrainedSchedulerBackend.scala | 4 +- .../scheduler/cluster/ExecutorData.scala | 5 +- .../scheduler/cluster/ExecutorInfo.scala | 9 +-- .../cluster/mesos/MesosSchedulerBackend.scala | 3 +- .../apache/spark/ui/exec/ExecutorsPage.scala | 31 ++++++++-- .../apache/spark/ui/exec/ExecutorsTab.scala | 6 ++ .../org/apache/spark/util/JsonProtocol.scala | 6 +- .../spark/deploy/JsonProtocolSuite.scala | 2 +- .../spark/deploy/LogUrlsStandaloneSuite.scala | 59 +++++++++++++++++++ .../deploy/worker/ExecutorRunnerTest.scala | 2 +- .../mesos/MesosSchedulerBackendSuite.scala | 4 +- .../apache/spark/util/JsonProtocolSuite.scala | 12 +++- .../spark/deploy/yarn/ExecutorRunnable.scala | 12 +++- .../spark/deploy/yarn/YarnClusterSuite.scala | 31 +++++++++- 18 files changed, 178 insertions(+), 30 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala index bc9f78b9e5c77..0add3064da452 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala @@ -43,6 +43,7 @@ private[spark] class ExecutorRunner( val worker: ActorRef, val workerId: String, val host: String, + val webUiPort: Int, val sparkHome: File, val executorDir: File, val workerUrl: String, @@ -134,6 +135,12 @@ private[spark] class ExecutorRunner( // In case we are running this from within the Spark Shell, avoid creating a "scala" // parent process for the executor command builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0") + + // Add webUI log urls + val baseUrl = s"http://$host:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType=" + builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr") + builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout") + process = builder.start() val header = "Spark Executor Command: %s\n%s\n\n".format( command.mkString("\"", "\" \"", "\""), "=" * 40) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index b20f5c0c82895..10929eb516041 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -362,6 +362,7 @@ private[spark] class Worker( self, workerId, host, + webUiPort, sparkHome, executorDir, akkaUrl, diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index bc72c8970319c..3a42f8b157977 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -49,10 +49,16 @@ private[spark] class CoarseGrainedExecutorBackend( override def preStart() { logInfo("Connecting to driver: " + driverUrl) driver = context.actorSelection(driverUrl) - driver ! RegisterExecutor(executorId, hostPort, cores) + driver ! 
RegisterExecutor(executorId, hostPort, cores, extractLogUrls) context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent]) } + def extractLogUrls: Map[String, String] = { + val prefix = "SPARK_LOG_URL_" + sys.env.filterKeys(_.startsWith(prefix)) + .map(e => (e._1.substring(prefix.length).toLowerCase, e._2)) + } + override def receiveWithLogging = { case RegisteredExecutor => logInfo("Successfully registered with driver") diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala index 1da6fe976da5b..9bf74f4be198d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala @@ -39,7 +39,11 @@ private[spark] object CoarseGrainedClusterMessages { case class RegisterExecutorFailed(message: String) extends CoarseGrainedClusterMessage // Executors to driver - case class RegisterExecutor(executorId: String, hostPort: String, cores: Int) + case class RegisterExecutor( + executorId: String, + hostPort: String, + cores: Int, + logUrls: Map[String, String]) extends CoarseGrainedClusterMessage { Utils.checkHostPort(hostPort, "Expected host port") } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 103a5c053c289..9d2fb4f3b4729 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -86,7 +86,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste } def receiveWithLogging = { - case RegisterExecutor(executorId, hostPort, cores) => + case RegisterExecutor(executorId, hostPort, cores, logUrls) => Utils.checkHostPort(hostPort, "Host port expected " + hostPort) if (executorDataMap.contains(executorId)) { sender ! 
RegisterExecutorFailed("Duplicate executor ID: " + executorId) @@ -98,7 +98,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste totalCoreCount.addAndGet(cores) totalRegisteredExecutors.addAndGet(1) val (host, _) = Utils.parseHostPort(hostPort) - val data = new ExecutorData(sender, sender.path.address, host, cores, cores) + val data = new ExecutorData(sender, sender.path.address, host, cores, cores, logUrls) // This must be synchronized because variables mutated // in this block are read when requesting executors CoarseGrainedSchedulerBackend.this.synchronized { diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala index eb52ddfb1eab1..5e571efe76720 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala @@ -33,5 +33,6 @@ private[cluster] class ExecutorData( val executorAddress: Address, override val executorHost: String, var freeCores: Int, - override val totalCores: Int -) extends ExecutorInfo(executorHost, totalCores) + override val totalCores: Int, + override val logUrlMap: Map[String, String] +) extends ExecutorInfo(executorHost, totalCores, logUrlMap) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala index b4738e64c9391..7f218566146a1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala @@ -25,8 +25,8 @@ import org.apache.spark.annotation.DeveloperApi @DeveloperApi class ExecutorInfo( val executorHost: String, - val totalCores: Int -) { + val totalCores: Int, + val logUrlMap: Map[String, String]) { def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo] @@ -34,12 +34,13 @@ class ExecutorInfo( case that: ExecutorInfo => (that canEqual this) && executorHost == that.executorHost && - totalCores == that.totalCores + totalCores == that.totalCores && + logUrlMap == that.logUrlMap case _ => false } override def hashCode(): Int = { - val state = Seq(executorHost, totalCores) + val state = Seq(executorHost, totalCores, logUrlMap) state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index c3c546be6da15..cfb6592e14aa8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -270,7 +270,8 @@ private[spark] class MesosSchedulerBackend( mesosTasks.foreach { case (slaveId, tasks) => slaveIdToWorkerOffer.get(slaveId).foreach(o => listenerBus.post(SparkListenerExecutorAdded(System.currentTimeMillis(), slaveId, - new ExecutorInfo(o.host, o.cores))) + // TODO: Add support for log urls for Mesos + new ExecutorInfo(o.host, o.cores, Map.empty))) ) d.launchTasks(Collections.singleton(slaveIdToOffer(slaveId).getId), tasks, filters) } diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala index 363cb96de7998..956608d7c0cbe 100644 --- 
a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala @@ -26,7 +26,8 @@ import org.apache.spark.ui.{ToolTips, UIUtils, WebUIPage} import org.apache.spark.util.Utils /** Summary information about an executor to display in the UI. */ -private case class ExecutorSummaryInfo( +// Needs to be private[ui] because of a false positive MiMa failure. +private[ui] case class ExecutorSummaryInfo( id: String, hostPort: String, rddBlocks: Int, @@ -40,7 +41,8 @@ private case class ExecutorSummaryInfo( totalInputBytes: Long, totalShuffleRead: Long, totalShuffleWrite: Long, - maxMemory: Long) + maxMemory: Long, + executorLogs: Map[String, String]) private[ui] class ExecutorsPage( parent: ExecutorsTab, @@ -55,6 +57,7 @@ private[ui] class ExecutorsPage( val diskUsed = storageStatusList.map(_.diskUsed).sum val execInfo = for (statusId <- 0 until storageStatusList.size) yield getExecInfo(statusId) val execInfoSorted = execInfo.sortBy(_.id) + val logsExist = execInfo.filter(_.executorLogs.nonEmpty).nonEmpty val execTable = @@ -79,10 +82,11 @@ private[ui] class ExecutorsPage( Shuffle Write + {if (logsExist) else Seq.empty} {if (threadDumpEnabled) else Seq.empty} - {execInfoSorted.map(execRow)} + {execInfoSorted.map(execRow(_, logsExist))}
Logs Thread Dump
@@ -107,7 +111,7 @@ private[ui] class ExecutorsPage( } /** Render an HTML row representing an executor */ - private def execRow(info: ExecutorSummaryInfo): Seq[Node] = { + private def execRow(info: ExecutorSummaryInfo, logsExist: Boolean): Seq[Node] = { val maximumMemory = info.maxMemory val memoryUsed = info.memoryUsed val diskUsed = info.diskUsed @@ -138,6 +142,21 @@ private[ui] class ExecutorsPage( {Utils.bytesToString(info.totalShuffleWrite)} + { + if (logsExist) { + + { + info.executorLogs.map { case (logName, logUrl) => + + } + } + + } + } { if (threadDumpEnabled) { val encodedId = URLEncoder.encode(info.id, "UTF-8") @@ -168,6 +187,7 @@ private[ui] class ExecutorsPage( val totalInputBytes = listener.executorToInputBytes.getOrElse(execId, 0L) val totalShuffleRead = listener.executorToShuffleRead.getOrElse(execId, 0L) val totalShuffleWrite = listener.executorToShuffleWrite.getOrElse(execId, 0L) + val executorLogs = listener.executorToLogUrls.getOrElse(execId, Map.empty) new ExecutorSummaryInfo( execId, @@ -183,7 +203,8 @@ private[ui] class ExecutorsPage( totalInputBytes, totalShuffleRead, totalShuffleWrite, - maxMem + maxMem, + executorLogs ) } } diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala index dd1c2b78c4094..a38cb75fdd8c6 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala @@ -51,9 +51,15 @@ class ExecutorsListener(storageStatusListener: StorageStatusListener) extends Sp val executorToOutputBytes = HashMap[String, Long]() val executorToShuffleRead = HashMap[String, Long]() val executorToShuffleWrite = HashMap[String, Long]() + val executorToLogUrls = HashMap[String, Map[String, String]]() def storageStatusList = storageStatusListener.storageStatusList + override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded) = synchronized { + val eid = executorAdded.executorId + executorToLogUrls(eid) = executorAdded.executorInfo.logUrlMap + } + override def onTaskStart(taskStart: SparkListenerTaskStart) = synchronized { val eid = taskStart.taskInfo.executorId executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 0) + 1 diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 8e0e41ad3782e..c8407bbcb780b 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -383,7 +383,8 @@ private[spark] object JsonProtocol { def executorInfoToJson(executorInfo: ExecutorInfo): JValue = { ("Host" -> executorInfo.executorHost) ~ - ("Total Cores" -> executorInfo.totalCores) + ("Total Cores" -> executorInfo.totalCores) ~ + ("Log Urls" -> mapToJson(executorInfo.logUrlMap)) } /** ------------------------------ * @@ -792,7 +793,8 @@ private[spark] object JsonProtocol { def executorInfoFromJson(json: JValue): ExecutorInfo = { val executorHost = (json \ "Host").extract[String] val totalCores = (json \ "Total Cores").extract[Int] - new ExecutorInfo(executorHost, totalCores) + val logUrls = mapFromJson(json \ "Log Urls").toMap + new ExecutorInfo(executorHost, totalCores, logUrls) } /** -------------------------------- * diff --git a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala index aa65f7e8915e6..ed02ca81e405c 100644 --- 
a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala @@ -117,7 +117,7 @@ class JsonProtocolSuite extends FunSuite { } def createExecutorRunner(): ExecutorRunner = { - new ExecutorRunner("appId", 123, createAppDesc(), 4, 1234, null, "workerId", "host", + new ExecutorRunner("appId", 123, createAppDesc(), 4, 1234, null, "workerId", "host", 123, new File("sparkHome"), new File("workDir"), "akka://worker", new SparkConf, Seq("localDir"), ExecutorState.RUNNING) } diff --git a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala new file mode 100644 index 0000000000000..f33bdc73e40ac --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy + +import scala.collection.mutable + +import org.scalatest.{BeforeAndAfter, FunSuite} + +import org.apache.spark.scheduler.cluster.ExecutorInfo +import org.apache.spark.scheduler.{SparkListenerExecutorAdded, SparkListener} +import org.apache.spark.{SparkContext, LocalSparkContext} + +class LogUrlsStandaloneSuite extends FunSuite with LocalSparkContext with BeforeAndAfter { + + /** Length of time to wait while draining listener events. 
*/ + val WAIT_TIMEOUT_MILLIS = 10000 + + before { + sc = new SparkContext("local-cluster[2,1,512]", "test") + } + + test("verify log urls get propagated from workers") { + val listener = new SaveExecutorInfo + sc.addSparkListener(listener) + + val rdd1 = sc.parallelize(1 to 100, 4) + val rdd2 = rdd1.map(_.toString) + rdd2.setName("Target RDD") + rdd2.count() + + assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)) + listener.addedExecutorInfos.values.foreach { info => + assert(info.logUrlMap.nonEmpty) + } + } + + private class SaveExecutorInfo extends SparkListener { + val addedExecutorInfos = mutable.Map[String, ExecutorInfo]() + + override def onExecutorAdded(executor: SparkListenerExecutorAdded) { + addedExecutorInfos(executor.executorId) = executor.executorInfo + } + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala index 6f233d7cf97aa..76511699e5ac5 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala @@ -32,7 +32,7 @@ class ExecutorRunnerTest extends FunSuite { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val appDesc = new ApplicationDescription("app name", Some(8), 500, Command("foo", Seq(appId), Map(), Seq(), Seq(), Seq()), "appUiUrl") - val er = new ExecutorRunner(appId, 1, appDesc, 8, 500, null, "blah", "worker321", + val er = new ExecutorRunner(appId, 1, appDesc, 8, 500, null, "blah", "worker321", 123, new File(sparkHome), new File("ooga"), "blah", new SparkConf, Seq("localDir"), ExecutorState.RUNNING) val builder = CommandUtils.buildProcessBuilder(appDesc.command, 512, sparkHome, er.substituteVariables) diff --git a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala index f2ff98eb72daf..46ab02bfef780 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala @@ -43,7 +43,7 @@ class MesosSchedulerBackendSuite extends FunSuite with LocalSparkContext with Ea conf.set("spark.mesos.executor.home" , "/mesos-home") val listenerBus = EasyMock.createMock(classOf[LiveListenerBus]) - listenerBus.post(SparkListenerExecutorAdded(EasyMock.anyLong, "s1", new ExecutorInfo("host1", 2))) + listenerBus.post(SparkListenerExecutorAdded(EasyMock.anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty))) EasyMock.replay(listenerBus) val sc = EasyMock.createMock(classOf[SparkContext]) @@ -88,7 +88,7 @@ class MesosSchedulerBackendSuite extends FunSuite with LocalSparkContext with Ea val taskScheduler = EasyMock.createMock(classOf[TaskSchedulerImpl]) val listenerBus = EasyMock.createMock(classOf[LiveListenerBus]) - listenerBus.post(SparkListenerExecutorAdded(EasyMock.anyLong, "s1", new ExecutorInfo("host1", 2))) + listenerBus.post(SparkListenerExecutorAdded(EasyMock.anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty))) EasyMock.replay(listenerBus) val sc = EasyMock.createMock(classOf[SparkContext]) diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 6577ebaa2e9a8..842f54529baf0 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -76,8 +76,9 @@ class JsonProtocolSuite extends FunSuite { val unpersistRdd = SparkListenerUnpersistRDD(12345) val applicationStart = SparkListenerApplicationStart("The winner of all", None, 42L, "Garfield") val applicationEnd = SparkListenerApplicationEnd(42L) + val logUrlMap = Map("stderr" -> "mystderr", "stdout" -> "mystdout").toMap val executorAdded = SparkListenerExecutorAdded(executorAddedTime, "exec1", - new ExecutorInfo("Hostee.awesome.com", 11)) + new ExecutorInfo("Hostee.awesome.com", 11, logUrlMap)) val executorRemoved = SparkListenerExecutorRemoved(executorRemovedTime, "exec2", "test reason") testEvent(stageSubmitted, stageSubmittedJsonString) @@ -100,13 +101,14 @@ class JsonProtocolSuite extends FunSuite { } test("Dependent Classes") { + val logUrlMap = Map("stderr" -> "mystderr", "stdout" -> "mystdout").toMap testRDDInfo(makeRddInfo(2, 3, 4, 5L, 6L)) testStageInfo(makeStageInfo(10, 20, 30, 40L, 50L)) testTaskInfo(makeTaskInfo(999L, 888, 55, 777L, false)) testTaskMetrics(makeTaskMetrics( 33333L, 44444L, 55555L, 66666L, 7, 8, hasHadoopInput = false, hasOutput = false)) testBlockManagerId(BlockManagerId("Hong", "Kong", 500)) - testExecutorInfo(new ExecutorInfo("host", 43)) + testExecutorInfo(new ExecutorInfo("host", 43, logUrlMap)) // StorageLevel testStorageLevel(StorageLevel.NONE) @@ -1463,7 +1465,11 @@ class JsonProtocolSuite extends FunSuite { | "Executor ID": "exec1", | "Executor Info": { | "Host": "Hostee.awesome.com", - | "Total Cores": 11 + | "Total Cores": 11, + | "Log Urls" : { + | "stderr" : "mystderr", + | "stdout" : "mystdout" + | } | } |} """ diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index ee2002a35f523..408cf09b9bdfa 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -56,7 +56,7 @@ class ExecutorRunnable( var rpc: YarnRPC = YarnRPC.create(conf) var nmClient: NMClient = _ val yarnConf: YarnConfiguration = new YarnConfiguration(conf) - lazy val env = prepareEnvironment + lazy val env = prepareEnvironment(container) def run = { logInfo("Starting Executor Container") @@ -254,7 +254,7 @@ class ExecutorRunnable( localResources } - private def prepareEnvironment: HashMap[String, String] = { + private def prepareEnvironment(container: Container): HashMap[String, String] = { val env = new HashMap[String, String]() val extraCp = sparkConf.getOption("spark.executor.extraClassPath") Client.populateClasspath(null, yarnConf, sparkConf, env, extraCp) @@ -270,6 +270,14 @@ class ExecutorRunnable( YarnSparkHadoopUtil.setEnvFromInputString(env, userEnvs) } + // Add log urls + sys.env.get("SPARK_USER").foreach { user => + val baseUrl = "http://%s/node/containerlogs/%s/%s" + .format(container.getNodeHttpAddress, ConverterUtils.toString(container.getId), user) + env("SPARK_LOG_URL_STDERR") = s"$baseUrl/stderr?start=0" + env("SPARK_LOG_URL_STDOUT") = s"$baseUrl/stdout?start=0" + } + System.getenv().filterKeys(_.startsWith("SPARK")).foreach { case (k, v) => env(k) = v } env } diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index 7165918e1bfcf..eda40efc4c77f 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ 
b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -21,16 +21,17 @@ import java.io.File import java.util.concurrent.TimeUnit import scala.collection.JavaConversions._ +import scala.collection.mutable import com.google.common.base.Charsets import com.google.common.io.Files -import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} - import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.server.MiniYARNCluster +import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException} -import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.scheduler.cluster.ExecutorInfo +import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded} import org.apache.spark.util.Utils class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers with Logging { @@ -143,6 +144,11 @@ class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers wit var result = File.createTempFile("result", null, tempDir) YarnClusterDriver.main(Array("yarn-client", result.getAbsolutePath())) checkResult(result) + + // verify log urls are present + YarnClusterDriver.listener.addedExecutorInfos.values.foreach { info => + assert(info.logUrlMap.nonEmpty) + } } test("run Spark in yarn-cluster mode") { @@ -156,6 +162,11 @@ class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers wit "--num-executors", "1") Client.main(args) checkResult(result) + + // verify log urls are present. + YarnClusterDriver.listener.addedExecutorInfos.values.foreach { info => + assert(info.logUrlMap.nonEmpty) + } } test("run Spark in yarn-cluster mode unsuccessfully") { @@ -203,8 +214,19 @@ class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers wit } +private class SaveExecutorInfo extends SparkListener { + val addedExecutorInfos = mutable.Map[String, ExecutorInfo]() + + override def onExecutorAdded(executor : SparkListenerExecutorAdded) { + addedExecutorInfos(executor.executorId) = executor.executorInfo + } +} + private object YarnClusterDriver extends Logging with Matchers { + val WAIT_TIMEOUT_MILLIS = 10000 + var listener: SaveExecutorInfo = null + def main(args: Array[String]) = { if (args.length != 2) { System.err.println( @@ -216,12 +238,15 @@ private object YarnClusterDriver extends Logging with Matchers { System.exit(1) } + listener = new SaveExecutorInfo val sc = new SparkContext(new SparkConf().setMaster(args(0)) .setAppName("yarn \"test app\" 'with quotes' and \\back\\slashes and $dollarSigns")) + sc.addSparkListener(listener) val status = new File(args(1)) var result = "failure" try { val data = sc.parallelize(1 to 4, 4).collect().toSet + assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)) data should be (Set(1, 2, 3, 4)) result = "success" } finally { From 0d74bd7fd7b2722d08eddc5c269b8b2b6cb47635 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 6 Feb 2015 11:14:58 -0800 Subject: [PATCH 018/357] [SPARK-5555] Enable UISeleniumSuite tests This patch enables UISeleniumSuite, a set of tests for the Spark application web UI. These tests were previously disabled because they were slow, but I think we now have sufficient test time budget that the benefit of enabling them outweighs the time costs. 
Author: Josh Rosen Closes #4334 from JoshRosen/enable-uiseleniumsuite and squashes the following commits: 4ab9477 [Josh Rosen] Use BeforeAndAfterAll to cleanup WebDriver 71efc72 [Josh Rosen] Update broken UISeleniumSuite tests; use random port #. a5ab595 [Josh Rosen] Enable UISeleniumSuite tests. --- .../org/apache/spark/ui/UISeleniumSuite.scala | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index e85a436cdba17..6a972381faf14 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -32,12 +32,21 @@ import org.apache.spark.api.java.StorageLevels import org.apache.spark.shuffle.FetchFailedException /** - * Selenium tests for the Spark Web UI. These tests are not run by default - * because they're slow. + * Selenium tests for the Spark Web UI. */ -@DoNotDiscover -class UISeleniumSuite extends FunSuite with WebBrowser with Matchers { - implicit val webDriver: WebDriver = new HtmlUnitDriver +class UISeleniumSuite extends FunSuite with WebBrowser with Matchers with BeforeAndAfterAll { + + implicit var webDriver: WebDriver = _ + + override def beforeAll(): Unit = { + webDriver = new HtmlUnitDriver + } + + override def afterAll(): Unit = { + if (webDriver != null) { + webDriver.quit() + } + } /** * Create a test SparkContext with the SparkUI enabled. @@ -48,6 +57,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers { .setMaster("local") .setAppName("test") .set("spark.ui.enabled", "true") + .set("spark.ui.port", "0") val sc = new SparkContext(conf) assert(sc.ui.isDefined) sc @@ -93,7 +103,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers { } eventually(timeout(5 seconds), interval(50 milliseconds)) { go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/stages") - find(id("active")).get.text should be("Active Stages (0)") + find(id("active")) should be(None) // Since we hide empty tables find(id("failed")).get.text should be("Failed Stages (1)") } @@ -105,7 +115,7 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers { } eventually(timeout(5 seconds), interval(50 milliseconds)) { go to (sc.ui.get.appUIAddress.stripSuffix("/") + "/stages") - find(id("active")).get.text should be("Active Stages (0)") + find(id("active")) should be(None) // Since we hide empty tables // The failure occurs before the stage becomes active, hence we should still show only one // failed stage, not two: find(id("failed")).get.text should be("Failed Stages (1)") @@ -167,13 +177,14 @@ class UISeleniumSuite extends FunSuite with WebBrowser with Matchers { test("job progress bars should handle stage / task failures") { withSpark(newSparkContext()) { sc => - val data = sc.parallelize(Seq(1, 2, 3)).map(identity).groupBy(identity) + val data = sc.parallelize(Seq(1, 2, 3), 1).map(identity).groupBy(identity) val shuffleHandle = data.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]].shuffleHandle // Simulate fetch failures: val mappedData = data.map { x => val taskContext = TaskContext.get - if (taskContext.attemptNumber == 0) { // Cause this stage to fail on its first attempt. 
+ if (taskContext.taskAttemptId() == 1) { + // Cause the post-shuffle stage to fail on its first attempt with a single task failure val env = SparkEnv.get val bmAddress = env.blockManager.blockManagerId val shuffleId = shuffleHandle.shuffleId From 80f3bcb58f836cfe1829c85bdd349c10525c8a5e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 6 Feb 2015 11:22:11 -0800 Subject: [PATCH 019/357] [SPARK-5652][Mllib] Use broadcasted weights in LogisticRegressionModel `LogisticRegressionModel`'s `predictPoint` should directly use broadcasted weights. This pr also fixes the compilation errors of two unit test suite: `JavaLogisticRegressionSuite ` and `JavaLinearRegressionSuite`. Author: Liang-Chi Hsieh Closes #4429 from viirya/use_bcvalue and squashes the following commits: 5a797e5 [Liang-Chi Hsieh] Use broadcasted weights. Fix compilation error. --- .../spark/mllib/classification/LogisticRegression.scala | 8 ++++---- .../ml/classification/JavaLogisticRegressionSuite.java | 4 ++-- .../spark/ml/regression/JavaLinearRegressionSuite.java | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index a668e7a7a359f..9a391bfff76a3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD * * @param weights Weights computed for every feature. * @param intercept Intercept computed for this model. (Only used in Binary Logistic Regression. - * In Multinomial Logistic Regression, the intercepts will not be a single values, + * In Multinomial Logistic Regression, the intercepts will not be a single value, * so the intercepts will be part of the weights.) * @param numFeatures the dimension of the features. * @param numClasses the number of possible outcomes for k classes classification problem in @@ -107,7 +107,7 @@ class LogisticRegressionModel ( // If dataMatrix and weightMatrix have the same dimension, it's binary logistic regression. 
if (numClasses == 2) { require(numFeatures == weightMatrix.size) - val margin = dot(weights, dataMatrix) + intercept + val margin = dot(weightMatrix, dataMatrix) + intercept val score = 1.0 / (1.0 + math.exp(-margin)) threshold match { case Some(t) => if (score > t) 1.0 else 0.0 @@ -116,11 +116,11 @@ class LogisticRegressionModel ( } else { val dataWithBiasSize = weightMatrix.size / (numClasses - 1) - val weightsArray = weights match { + val weightsArray = weightMatrix match { case dv: DenseVector => dv.values case _ => throw new IllegalArgumentException( - s"weights only supports dense vector but got type ${weights.getClass}.") + s"weights only supports dense vector but got type ${weightMatrix.getClass}.") } val margins = (0 until numClasses - 1).map { i => diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java index 26284023b0f69..d4b664479255d 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java @@ -84,7 +84,7 @@ public void logisticRegressionWithSetters() { .setThreshold(0.6) .setProbabilityCol("myProbability"); LogisticRegressionModel model = lr.fit(dataset); - assert(model.fittingParamMap().apply(lr.maxIter()) == 10); + assert(model.fittingParamMap().apply(lr.maxIter()).equals(10)); assert(model.fittingParamMap().apply(lr.regParam()).equals(1.0)); assert(model.fittingParamMap().apply(lr.threshold()).equals(0.6)); assert(model.getThreshold() == 0.6); @@ -109,7 +109,7 @@ public void logisticRegressionWithSetters() { // Call fit() with new params, and check as many params as we can. LogisticRegressionModel model2 = lr.fit(dataset, lr.maxIter().w(5), lr.regParam().w(0.1), lr.threshold().w(0.4), lr.probabilityCol().w("theProb")); - assert(model2.fittingParamMap().apply(lr.maxIter()) == 5); + assert(model2.fittingParamMap().apply(lr.maxIter()).equals(5)); assert(model2.fittingParamMap().apply(lr.regParam()).equals(0.1)); assert(model2.fittingParamMap().apply(lr.threshold()).equals(0.4)); assert(model2.getThreshold() == 0.4); diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java index 5bd616e74d86c..40d5a92bb32af 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java @@ -76,13 +76,13 @@ public void linearRegressionWithSetters() { .setMaxIter(10) .setRegParam(1.0); LinearRegressionModel model = lr.fit(dataset); - assert(model.fittingParamMap().apply(lr.maxIter()) == 10); + assert(model.fittingParamMap().apply(lr.maxIter()).equals(10)); assert(model.fittingParamMap().apply(lr.regParam()).equals(1.0)); // Call fit() with new params, and check as many params as we can. 
LinearRegressionModel model2 = lr.fit(dataset, lr.maxIter().w(5), lr.regParam().w(0.1), lr.predictionCol().w("thePred")); - assert(model2.fittingParamMap().apply(lr.maxIter()) == 5); + assert(model2.fittingParamMap().apply(lr.maxIter()).equals(5)); assert(model2.fittingParamMap().apply(lr.regParam()).equals(0.1)); assert(model2.getPredictionCol().equals("thePred")); } From fb6c0cbac414f3c43dec2ab886ca8a1097b781f7 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 6 Feb 2015 11:47:32 -0800 Subject: [PATCH 020/357] [HOTFIX] Fix test build break in ExecutorAllocationManagerSuite. This was caused because #3486 added a new field to ExecutorInfo and #4369 added new tests that created ExecutorInfos. These patches were merged in quick succession and were never tested together, hence the compilation error. --- .../spark/ExecutorAllocationManagerSuite.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index c2869628afbb3..9eb87f016068d 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -145,7 +145,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { // Verify that running a task reduces the cap sc.listenerBus.postToAll(SparkListenerStageSubmitted(createStageInfo(1, 3))) sc.listenerBus.postToAll(SparkListenerExecutorAdded( - 0L, "executor-1", new ExecutorInfo("host1", 1))) + 0L, "executor-1", new ExecutorInfo("host1", 1, Map.empty))) sc.listenerBus.postToAll(SparkListenerTaskStart(1, 0, createTaskInfo(0, 0, "executor-1"))) assert(numExecutorsPending(manager) === 4) assert(addExecutors(manager) === 1) @@ -579,13 +579,13 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { // New executors have registered sc.listenerBus.postToAll(SparkListenerExecutorAdded( - 0L, "executor-1", new ExecutorInfo("host1", 1))) + 0L, "executor-1", new ExecutorInfo("host1", 1, Map.empty))) assert(executorIds(manager).size === 1) assert(executorIds(manager).contains("executor-1")) assert(removeTimes(manager).size === 1) assert(removeTimes(manager).contains("executor-1")) sc.listenerBus.postToAll(SparkListenerExecutorAdded( - 0L, "executor-2", new ExecutorInfo("host2", 1))) + 0L, "executor-2", new ExecutorInfo("host2", 1, Map.empty))) assert(executorIds(manager).size === 2) assert(executorIds(manager).contains("executor-2")) assert(removeTimes(manager).size === 2) @@ -612,7 +612,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { sc.listenerBus.postToAll(SparkListenerTaskStart(0, 0, createTaskInfo(0, 0, "executor-1"))) sc.listenerBus.postToAll(SparkListenerExecutorAdded( - 0L, "executor-1", new ExecutorInfo("host1", 1))) + 0L, "executor-1", new ExecutorInfo("host1", 1, Map.empty))) assert(executorIds(manager).size === 1) assert(executorIds(manager).contains("executor-1")) assert(removeTimes(manager).size === 0) @@ -624,7 +624,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { assert(executorIds(manager).isEmpty) assert(removeTimes(manager).isEmpty) sc.listenerBus.postToAll(SparkListenerExecutorAdded( - 0L, "executor-1", new ExecutorInfo("host1", 1))) + 0L, "executor-1", new ExecutorInfo("host1", 1, Map.empty))) sc.listenerBus.postToAll(SparkListenerTaskStart(0, 0, createTaskInfo(0, 0, "executor-1"))) 
assert(executorIds(manager).size === 1) @@ -632,7 +632,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { assert(removeTimes(manager).size === 0) sc.listenerBus.postToAll(SparkListenerExecutorAdded( - 0L, "executor-2", new ExecutorInfo("host1", 1))) + 0L, "executor-2", new ExecutorInfo("host1", 1, Map.empty))) assert(executorIds(manager).size === 2) assert(executorIds(manager).contains("executor-2")) assert(removeTimes(manager).size === 1) From af2a2a263ac5d890e84d012b75fcb50e02c9ede8 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Fri, 6 Feb 2015 11:50:20 -0800 Subject: [PATCH 021/357] [SPARK-4361][Doc] Add more docs for Hadoop Configuration I'm trying to point out reusing a Configuration in these APIs is dangerous. Any better idea? Author: zsxwing Closes #3225 from zsxwing/SPARK-4361 and squashes the following commits: fe4e3d5 [zsxwing] Add more docs for Hadoop Configuration --- .../scala/org/apache/spark/SparkContext.scala | 20 +++++++++++-- .../spark/api/java/JavaSparkContext.scala | 28 +++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 24490fddc5c6a..5623587c36fa6 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -288,7 +288,12 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // the bound port to the cluster manager properly ui.foreach(_.bind()) - /** A default Hadoop Configuration for the Hadoop code (e.g. file systems) that we reuse. */ + /** + * A default Hadoop Configuration for the Hadoop code (e.g. file systems) that we reuse. + * + * '''Note:''' As it will be reused in all Hadoop RDDs, it's better not to modify it unless you + * plan to set some global configurations for all Hadoop RDDs. + */ val hadoopConfiguration = SparkHadoopUtil.get.newConfiguration(conf) // Add each JAR given through the constructor @@ -694,7 +699,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * necessary info (e.g. file name for a filesystem-based dataset, table name for HyperTable), * using the older MapReduce API (`org.apache.hadoop.mapred`). * - * @param conf JobConf for setting up the dataset + * @param conf JobConf for setting up the dataset. Note: This will be put into a Broadcast. + * Therefore if you plan to reuse this conf to create multiple RDDs, you need to make + * sure you won't modify the conf. A safe approach is always creating a new conf for + * a new RDD. * @param inputFormatClass Class of the InputFormat * @param keyClass Class of the keys * @param valueClass Class of the values @@ -830,6 +838,14 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * Get an RDD for a given Hadoop file with an arbitrary new API InputFormat * and extra configuration options to pass to the input format. * + * @param conf Configuration for setting up the dataset. Note: This will be put into a Broadcast. + * Therefore if you plan to reuse this conf to create multiple RDDs, you need to make + * sure you won't modify the conf. A safe approach is always creating a new conf for + * a new RDD. 
+ * @param fClass Class of the InputFormat + * @param kClass Class of the keys + * @param vClass Class of the values + * * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 97f5c9f257e09..6d6ed693be752 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -373,6 +373,15 @@ class JavaSparkContext(val sc: SparkContext) * other necessary info (e.g. file name for a filesystem-based dataset, table name for HyperTable, * etc). * + * @param conf JobConf for setting up the dataset. Note: This will be put into a Broadcast. + * Therefore if you plan to reuse this conf to create multiple RDDs, you need to make + * sure you won't modify the conf. A safe approach is always creating a new conf for + * a new RDD. + * @param inputFormatClass Class of the InputFormat + * @param keyClass Class of the keys + * @param valueClass Class of the values + * @param minPartitions Minimum number of Hadoop Splits to generate. + * * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using @@ -395,6 +404,14 @@ class JavaSparkContext(val sc: SparkContext) * Get an RDD for a Hadoop-readable dataset from a Hadooop JobConf giving its InputFormat and any * other necessary info (e.g. file name for a filesystem-based dataset, table name for HyperTable, * + * @param conf JobConf for setting up the dataset. Note: This will be put into a Broadcast. + * Therefore if you plan to reuse this conf to create multiple RDDs, you need to make + * sure you won't modify the conf. A safe approach is always creating a new conf for + * a new RDD. + * @param inputFormatClass Class of the InputFormat + * @param keyClass Class of the keys + * @param valueClass Class of the values + * * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using @@ -476,6 +493,14 @@ class JavaSparkContext(val sc: SparkContext) * Get an RDD for a given Hadoop file with an arbitrary new API InputFormat * and extra configuration options to pass to the input format. * + * @param conf Configuration for setting up the dataset. Note: This will be put into a Broadcast. + * Therefore if you plan to reuse this conf to create multiple RDDs, you need to make + * sure you won't modify the conf. A safe approach is always creating a new conf for + * a new RDD. + * @param fClass Class of the InputFormat + * @param kClass Class of the keys + * @param vClass Class of the values + * * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. 
* If you plan to directly cache Hadoop writable objects, you should first copy them using @@ -675,6 +700,9 @@ class JavaSparkContext(val sc: SparkContext) /** * Returns the Hadoop configuration used for the Hadoop code (e.g. file systems) we reuse. + * + * '''Note:''' As it will be reused in all Hadoop RDDs, it's better not to modify it unless you + * plan to set some global configurations for all Hadoop RDDs. */ def hadoopConfiguration(): Configuration = { sc.hadoopConfiguration From d34f79c8db79ae461fadae190446ebc19091bec9 Mon Sep 17 00:00:00 2001 From: WangTaoTheTonic Date: Fri, 6 Feb 2015 11:57:02 -0800 Subject: [PATCH 022/357] [SPARK-2945][YARN][Doc]add doc for spark.executor.instances https://issues.apache.org/jira/browse/SPARK-2945 spark.executor.instances works. As this JIRA recommended, we should add docs for this common config. Author: WangTaoTheTonic Closes #4350 from WangTaoTheTonic/SPARK-2945 and squashes the following commits: 4c3913a [WangTaoTheTonic] not compatible with dynamic allocation 5fa9c46 [WangTaoTheTonic] add doc for spark.executor.instances --- docs/running-on-yarn.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 68ab127bcf087..2b93eef6c26ed 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -104,6 +104,13 @@ Most of the configs are the same for Spark on YARN as for other deployment modes Comma-separated list of files to be placed in the working directory of each executor. + + spark.executor.instances + 2 + + The number of executors. Note that this property is incompatible with spark.dynamicAllocation.enabled. + + spark.yarn.executor.memoryOverhead executorMemory * 0.07, with minimum of 384 From 70e5b030a78ddfdcc8c9eee568009f277dee0872 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Fri, 6 Feb 2015 12:08:22 -0800 Subject: [PATCH 023/357] [SPARK-5628] Add version option to spark-ec2 Every proper command line tool should include a `--version` option or something similar. This PR adds this to `spark-ec2` using the standard functionality provided by `optparse`. One thing we don't do here is follow the Python convention of setting `__version__`, since it seems awkward given how `spark-ec2` is laid out. 
Author: Nicholas Chammas Closes #4414 from nchammas/spark-ec2-show-version and squashes the following commits: 914cab5 [Nicholas Chammas] add version info --- ec2/spark_ec2.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index abab209a05ba0..0de4a62e203fd 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -39,6 +39,9 @@ from optparse import OptionParser from sys import stderr +SPARK_EC2_VERSION = "1.2.0" +SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) + VALID_SPARK_VERSIONS = set([ "0.7.3", "0.8.0", @@ -54,9 +57,8 @@ "1.2.0", ]) -DEFAULT_SPARK_VERSION = "1.2.0" +DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark" -SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) MESOS_SPARK_EC2_BRANCH = "branch-1.3" # A URL prefix from which to fetch AMI information @@ -103,12 +105,10 @@ class UsageError(Exception): # Configure and parse our command-line arguments def parse_args(): parser = OptionParser( - usage="spark-ec2 [options] " - + "\n\n can be: launch, destroy, login, stop, start, get-master, reboot-slaves", - add_help_option=False) - parser.add_option( - "-h", "--help", action="help", - help="Show this help message and exit") + prog="spark-ec2", + version="%prog {v}".format(v=SPARK_EC2_VERSION), + usage="%prog [options] \n\n" + + " can be: launch, destroy, login, stop, start, get-master, reboot-slaves") parser.add_option( "-s", "--slaves", type="int", default=1, help="Number of slaves to launch (default: %default)") From d433816157bb3ae1f0fbe44efec43a0c906d9f82 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 6 Feb 2015 12:13:44 -0800 Subject: [PATCH 024/357] [SPARK-5650][SQL] Support optional 'FROM' clause In Hive, 'FROM' clause is optional. This pr supports it. Author: Liang-Chi Hsieh Closes #4426 from viirya/optional_from and squashes the following commits: fe81f31 [Liang-Chi Hsieh] Support optional 'FROM' clause. --- .../org/apache/spark/sql/hive/HiveQl.scala | 19 ++++++++++++++----- ... clause-0-b42b408a87b258921240058f880a721a | 1 + .../sql/hive/execution/HiveQuerySuite.scala | 3 +++ 3 files changed, 18 insertions(+), 5 deletions(-) create mode 100644 sql/hive/src/test/resources/golden/no from clause-0-b42b408a87b258921240058f880a721a diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index c19a091719190..6265c61a1dabe 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -556,9 +556,14 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case Token("TOK_TRUNCATETABLE", Token("TOK_TABLE_PARTITION",table)::Nil) => NativePlaceholder - case Token("TOK_QUERY", - Token("TOK_FROM", fromClause :: Nil) :: - insertClauses) => + case Token("TOK_QUERY", queryArgs) + if Seq("TOK_FROM", "TOK_INSERT").contains(queryArgs.head.getText) => + + val (fromClause: Option[ASTNode], insertClauses) = queryArgs match { + case Token("TOK_FROM", args: Seq[ASTNode]) :: insertClauses => + (Some(args.head), insertClauses) + case Token("TOK_INSERT", _) :: Nil => (None, queryArgs) + } // Return one query for each insert clause. 
val queries = insertClauses.map { case Token("TOK_INSERT", singleInsert) => @@ -599,8 +604,12 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C "TOK_LATERAL_VIEW"), singleInsert) } - - val relations = nodeToRelation(fromClause) + + val relations = fromClause match { + case Some(f) => nodeToRelation(f) + case None => NoRelation + } + val withWhere = whereClause.map { whereNode => val Seq(whereExpr) = whereNode.getChildren.toSeq Filter(nodeToExpr(whereExpr), relations) diff --git a/sql/hive/src/test/resources/golden/no from clause-0-b42b408a87b258921240058f880a721a b/sql/hive/src/test/resources/golden/no from clause-0-b42b408a87b258921240058f880a721a new file mode 100644 index 0000000000000..390d344ecb9d3 --- /dev/null +++ b/sql/hive/src/test/resources/golden/no from clause-0-b42b408a87b258921240058f880a721a @@ -0,0 +1 @@ +1 1 -1 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index bb73ff1ea7e43..d16a1e0b73faa 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -202,6 +202,9 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { createQueryTest("having no references", "SELECT key FROM src GROUP BY key HAVING COUNT(*) > 1") + createQueryTest("no from clause", + "SELECT 1, +1, -1") + createQueryTest("boolean = number", """ |SELECT From 500dc2b4b3136029457e708859fe27da93b1f9e8 Mon Sep 17 00:00:00 2001 From: Tobias Schlatter Date: Fri, 6 Feb 2015 12:15:02 -0800 Subject: [PATCH 025/357] [SPARK-5640] Synchronize ScalaReflection where necessary Author: Tobias Schlatter Closes #4431 from gzm0/sync-scala-refl and squashes the following commits: c5da21e [Tobias Schlatter] [SPARK-5640] Synchronize ScalaReflection where necessary --- .../org/apache/spark/sql/catalyst/ScalaReflection.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 0445f3aa07912..5d9c331ca5178 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -103,10 +103,11 @@ trait ScalaReflection { } /** Returns a catalyst DataType and its nullability for the given Scala Type using reflection. */ - def schemaFor[T: TypeTag]: Schema = schemaFor(typeOf[T]) + def schemaFor[T: TypeTag]: Schema = + ScalaReflectionLock.synchronized { schemaFor(typeOf[T]) } /** Returns a catalyst DataType and its nullability for the given Scala Type using reflection. */ - def schemaFor(tpe: `Type`): Schema = { + def schemaFor(tpe: `Type`): Schema = ScalaReflectionLock.synchronized { val className: String = tpe.erasure.typeSymbol.asClass.fullName tpe match { case t if Utils.classIsLoadable(className) && From a958d60975147fb1afc76fcbd80f65ac8d78759a Mon Sep 17 00:00:00 2001 From: q00251598 Date: Fri, 6 Feb 2015 12:29:26 -0800 Subject: [PATCH 026/357] [SPARK-5619][SQL] Support 'show roles' in HiveContext Author: q00251598 Closes #4397 from watermen/SPARK-5619 and squashes the following commits: f819b6c [q00251598] Support show roles in HiveContext. 
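As a usage-level sketch (not part of this patch, and assuming an existing `SparkContext` named `sc` plus a reachable Hive metastore), whitelisting `TOK_SHOW_ROLES` lets the statement be passed straight through to Hive as a native command:
```
import org.apache.spark.sql.hive.HiveContext

// Minimal sketch: once the token is whitelisted in HiveQl, SHOW ROLES is
// forwarded to Hive as a native command and the result comes back as rows.
val hiveContext = new HiveContext(sc)
hiveContext.sql("SHOW ROLES").collect().foreach(println)
```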
--- sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 6265c61a1dabe..be63aa1a93aee 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -68,6 +68,7 @@ private[hive] object HiveQl { "TOK_SHOWLOCKS", "TOK_UNLOCKTABLE", + "TOK_SHOW_ROLES", "TOK_CREATEROLE", "TOK_DROPROLE", "TOK_GRANT", From 0b7eb3f3b700080bf6cb810d092709a8a468e5db Mon Sep 17 00:00:00 2001 From: OopsOutOfMemory Date: Fri, 6 Feb 2015 12:33:20 -0800 Subject: [PATCH 027/357] [SPARK-5324][SQL] Results of describe can't be queried Make below code works. ``` sql("DESCRIBE test").registerTempTable("describeTest") sql("SELECT * FROM describeTest").collect() ``` Author: OopsOutOfMemory Author: Sheng, Li Closes #4249 from OopsOutOfMemory/desc_query and squashes the following commits: 6fee13d [OopsOutOfMemory] up-to-date e71430a [Sheng, Li] Update HiveOperatorQueryableSuite.scala 3ba1058 [OopsOutOfMemory] change to default argument aac7226 [OopsOutOfMemory] Merge branch 'master' into desc_query 68eb6dd [OopsOutOfMemory] Merge branch 'desc_query' of github.com:OopsOutOfMemory/spark into desc_query 354ad71 [OopsOutOfMemory] query describe command d541a35 [OopsOutOfMemory] refine test suite e1da481 [OopsOutOfMemory] refine test suite a780539 [OopsOutOfMemory] Merge branch 'desc_query' of github.com:OopsOutOfMemory/spark into desc_query 0015f82 [OopsOutOfMemory] code style dd0aaef [OopsOutOfMemory] code style c7d606d [OopsOutOfMemory] rename test suite 75f2342 [OopsOutOfMemory] refine code and test suite f942c9b [OopsOutOfMemory] initial 11559ae [OopsOutOfMemory] code style c5fdecf [OopsOutOfMemory] code style aeaea5f [OopsOutOfMemory] rename test suite ac2c3bb [OopsOutOfMemory] refine code and test suite 544573e [OopsOutOfMemory] initial --- .../org/apache/spark/sql/sources/ddl.scala | 11 ++-- .../org/apache/spark/sql/hive/HiveQl.scala | 5 +- .../HiveOperatorQueryableSuite.scala | 51 +++++++++++++++++++ .../sql/hive/execution/HiveQuerySuite.scala | 2 +- 4 files changed, 61 insertions(+), 8 deletions(-) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveOperatorQueryableSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index 9c37e0169ff85..a692ef51b31ed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -296,11 +296,14 @@ private[sql] case class ResolvedDataSource(provider: Class[_], relation: BaseRel private[sql] case class DescribeCommand( table: LogicalPlan, isExtended: Boolean) extends Command { - override def output = Seq( + override val output = Seq( // Column names are based on Hive. 
- AttributeReference("col_name", StringType, nullable = false)(), - AttributeReference("data_type", StringType, nullable = false)(), - AttributeReference("comment", StringType, nullable = false)()) + AttributeReference("col_name", StringType, nullable = false, + new MetadataBuilder().putString("comment", "name of the column").build())(), + AttributeReference("data_type", StringType, nullable = false, + new MetadataBuilder().putString("comment", "data type of the column").build())(), + AttributeReference("comment", StringType, nullable = false, + new MetadataBuilder().putString("comment", "comment of the column").build())()) } private[sql] case class CreateTableUsing( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index be63aa1a93aee..4b7fa06532963 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -497,15 +497,14 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C // TODO: Actually, a user may mean tableName.columnName. Need to resolve this issue. val tableIdent = extractTableIdent(nameParts.head) DescribeCommand( - UnresolvedRelation(tableIdent, None), extended.isDefined) + UnresolvedRelation(tableIdent, None), isExtended = extended.isDefined) case Token(".", dbName :: tableName :: colName :: Nil) => // It is describing a column with the format like "describe db.table column". NativePlaceholder case tableName => // It is describing a table with the format like "describe table". DescribeCommand( - UnresolvedRelation(Seq(tableName.getText), None), - extended.isDefined) + UnresolvedRelation(Seq(tableName.getText), None), isExtended = extended.isDefined) } } // All other cases. diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveOperatorQueryableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveOperatorQueryableSuite.scala new file mode 100644 index 0000000000000..efbef68cd4447 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveOperatorQueryableSuite.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.execution + +import org.apache.spark.sql.{Row, QueryTest} +import org.apache.spark.sql.hive.test.TestHive._ + +/** + * A set of tests that validates commands can also be queried by like a table + */ +class HiveOperatorQueryableSuite extends QueryTest { + test("SPARK-5324 query result of describe command") { + loadTestTable("src") + + // register a describe command to be a temp table + sql("desc src").registerTempTable("mydesc") + checkAnswer( + sql("desc mydesc"), + Seq( + Row("col_name", "string", "name of the column"), + Row("data_type", "string", "data type of the column"), + Row("comment", "string", "comment of the column"))) + + checkAnswer( + sql("select * from mydesc"), + Seq( + Row("key", "int", null), + Row("value", "string", null))) + + checkAnswer( + sql("select col_name, data_type, comment from mydesc"), + Seq( + Row("key", "int", null), + Row("value", "string", null))) + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index d16a1e0b73faa..27047ce4b1b0b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -59,7 +59,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { Locale.setDefault(originalLocale) } - test("SPARK-4908: concurent hive native commands") { + test("SPARK-4908: concurrent hive native commands") { (1 to 100).par.map { _ => sql("USE default") sql("SHOW TABLES") From 3eccf29ce061559c86e6f7338851932fc89a9afd Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Fri, 6 Feb 2015 12:38:07 -0800 Subject: [PATCH 028/357] [SPARK-5595][SPARK-5603][SQL] Add a rule to do PreInsert type casting and field renaming and invalidating in memory cache after INSERT This PR adds a rule to Analyzer that will add preinsert data type casting and field renaming to the select clause in an `INSERT INTO/OVERWRITE` statement. Also, with the change of this PR, we always invalidate our in memory data cache after inserting into a BaseRelation. cc marmbrus liancheng Author: Yin Huai Closes #4373 from yhuai/insertFollowUp and squashes the following commits: 08237a7 [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertFollowUp 316542e [Yin Huai] Doc update. c9ccfeb [Yin Huai] Revert a unnecessary change. 84aecc4 [Yin Huai] Address comments. 1951fe1 [Yin Huai] Merge remote-tracking branch 'upstream/master' c18da34 [Yin Huai] Invalidate cache after insert. 727f21a [Yin Huai] Preinsert casting and renaming. 
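For a concrete sense of what the new rule does, here is a minimal usage sketch (not part of this patch), assuming a `SQLContext` named `sqlContext` and the `jt`/`jsonTable` setup used by the new tests, where `jsonTable` has schema `(a INT, b STRING)` and `jt` exposes an integer column `a`:
```
// The SELECT produces two integer columns with generated names. With the new
// PreInsertCastAndRename rule, the analyzer wraps the query in a Project that
// renames the first column to `a` and casts/renames the second to `b` (STRING),
// roughly: SELECT (a * 2) AS a, CAST(a * 4 AS STRING) AS b FROM jt
sqlContext.sql("INSERT OVERWRITE TABLE jsonTable SELECT a * 2, a * 4 FROM jt")

// Because the insert also invalidates the in-memory cache of jsonTable, a
// subsequent (possibly cached) query reads the newly written rows.
sqlContext.sql("SELECT a, b FROM jsonTable").collect()
```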
--- .../org/apache/spark/sql/SQLContext.scala | 6 +- .../apache/spark/sql/json/JSONRelation.scala | 12 ++- .../sql/sources/DataSourceStrategy.scala | 2 +- .../apache/spark/sql/sources/commands.scala | 10 ++- .../apache/spark/sql/sources/interfaces.scala | 16 ++++ .../org/apache/spark/sql/sources/rules.scala | 76 ++++++++++++++++++ .../org/apache/spark/sql/json/JsonSuite.scala | 25 ++++++ .../spark/sql/sources/DataSourceTest.scala | 6 +- .../spark/sql/sources/InsertIntoSuite.scala | 80 +++++++++++++++++++ .../apache/spark/sql/hive/HiveContext.scala | 1 + 10 files changed, 227 insertions(+), 7 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 706ef6ad4f174..bf3990671029e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -91,7 +91,11 @@ class SQLContext(@transient val sparkContext: SparkContext) @transient protected[sql] lazy val analyzer: Analyzer = - new Analyzer(catalog, functionRegistry, caseSensitive = true) + new Analyzer(catalog, functionRegistry, caseSensitive = true) { + override val extendedRules = + sources.PreInsertCastAndRename :: + Nil + } @transient protected[sql] lazy val optimizer: Optimizer = DefaultOptimizer diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala index f27585d05a986..c4e14c6c92908 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala @@ -72,7 +72,6 @@ private[sql] case class JSONRelation( userSpecifiedSchema: Option[StructType])( @transient val sqlContext: SQLContext) extends TableScan with InsertableRelation { - // TODO: Support partitioned JSON relation. private def baseRDD = sqlContext.sparkContext.textFile(path) @@ -99,10 +98,21 @@ private[sql] case class JSONRelation( s"Unable to clear output directory ${filesystemPath.toString} prior" + s" to INSERT OVERWRITE a JSON table:\n${e.toString}") } + // Write the data. data.toJSON.saveAsTextFile(path) + // Right now, we assume that the schema is not changed. We will not update the schema. 
+ // schema = data.schema } else { // TODO: Support INSERT INTO sys.error("JSON table only support INSERT OVERWRITE for now.") } } + + override def hashCode(): Int = 41 * (41 + path.hashCode) + schema.hashCode() + + override def equals(other: Any): Boolean = other match { + case that: JSONRelation => + (this.path == that.path) && (this.schema == that.schema) + case _ => false + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala index d23ffb8b7a960..624369afe87b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala @@ -59,7 +59,7 @@ private[sql] object DataSourceStrategy extends Strategy { if (partition.nonEmpty) { sys.error(s"Insert into a partition is not allowed because $l is not partitioned.") } - execution.ExecutedCommand(InsertIntoRelation(t, query, overwrite)) :: Nil + execution.ExecutedCommand(InsertIntoDataSource(l, query, overwrite)) :: Nil case _ => Nil } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala index d7942dc30934b..c9cd0e6e93829 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala @@ -19,17 +19,21 @@ package org.apache.spark.sql.sources import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.RunnableCommand +import org.apache.spark.sql.execution.{LogicalRDD, RunnableCommand} -private[sql] case class InsertIntoRelation( - relation: InsertableRelation, +private[sql] case class InsertIntoDataSource( + logicalRelation: LogicalRelation, query: LogicalPlan, overwrite: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext) = { + val relation = logicalRelation.relation.asInstanceOf[InsertableRelation] relation.insert(DataFrame(sqlContext, query), overwrite) + // Invalidate the cache. + sqlContext.cacheManager.invalidateCache(logicalRelation) + Seq.empty[Row] } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 40fc1f2aa2724..a640ba57e0885 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -158,6 +158,22 @@ trait CatalystScan extends BaseRelation { } @DeveloperApi +/** + * ::DeveloperApi:: + * A BaseRelation that can be used to insert data into it through the insert method. + * If overwrite in insert method is true, the old data in the relation should be overwritten with + * the new data. If overwrite in insert method is false, the new data should be appended. + * + * InsertableRelation has the following three assumptions. + * 1. It assumes that the data (Rows in the DataFrame) provided to the insert method + * exactly matches the ordinal of fields in the schema of the BaseRelation. + * 2. It assumes that the schema of this relation will not be changed. + * Even if the insert method updates the schema (e.g. a relation of JSON or Parquet data may have a + * schema update after an insert operation), the new schema will not be used. + * 3. 
It assumes that fields of the data provided in the insert method are nullable. + * If a data source needs to check the actual nullability of a field, it needs to do it in the + * insert method. + */ trait InsertableRelation extends BaseRelation { def insert(data: DataFrame, overwrite: Boolean): Unit } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala new file mode 100644 index 0000000000000..4ed22d363da5b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources + +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Alias} +import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.types.DataType + +/** + * A rule to do pre-insert data type casting and field renaming. Before we insert into + * an [[InsertableRelation]], we will use this rule to make sure that + * the columns to be inserted have the correct data type and fields have the correct names. + * @param resolver The resolver used by the Analyzer. + */ +private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] { + def apply(plan: LogicalPlan): LogicalPlan = { + plan.transform { + // Wait until children are resolved. + case p: LogicalPlan if !p.childrenResolved => p + + // We are inserting into an InsertableRelation. + case i @ InsertIntoTable( + l @ LogicalRelation(r: InsertableRelation), partition, child, overwrite) => { + // First, make sure the data to be inserted have the same number of fields with the + // schema of the relation. + if (l.output.size != child.output.size) { + sys.error( + s"$l requires that the query in the SELECT clause of the INSERT INTO/OVERWRITE " + + s"statement generates the same number of columns as its schema.") + } + castAndRenameChildOutput(i, l.output, child) + } + } + } + + /** If necessary, cast data types and rename fields to the expected types and names. */ + def castAndRenameChildOutput( + insertInto: InsertIntoTable, + expectedOutput: Seq[Attribute], + child: LogicalPlan) = { + val newChildOutput = expectedOutput.zip(child.output).map { + case (expected, actual) => + val needCast = !DataType.equalsIgnoreNullability(expected.dataType, actual.dataType) + // We want to make sure the filed names in the data to be inserted exactly match + // names in the schema. 
+ val needRename = expected.name != actual.name + (needCast, needRename) match { + case (true, _) => Alias(Cast(actual, expected.dataType), expected.name)() + case (false, true) => Alias(actual, expected.name)() + case (_, _) => actual + } + } + + if (newChildOutput == child.output) { + insertInto + } else { + insertInto.copy(child = Project(newChildOutput, child)) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 1396c6b7246d1..926ba68828ee8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.json.JsonRDD.{compatibleType, enforceCorrectType} +import org.apache.spark.sql.sources.LogicalRelation import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.sql.types._ @@ -923,6 +924,30 @@ class JsonSuite extends QueryTest { sql("select structWithArrayFields.field1[1], structWithArrayFields.field2[3] from complexTable"), Row(5, null) ) + } + test("JSONRelation equality test") { + val relation1 = + JSONRelation("path", 1.0, Some(StructType(StructField("a", IntegerType, true) :: Nil)))(null) + val logicalRelation1 = LogicalRelation(relation1) + val relation2 = + JSONRelation("path", 0.5, Some(StructType(StructField("a", IntegerType, true) :: Nil)))( + org.apache.spark.sql.test.TestSQLContext) + val logicalRelation2 = LogicalRelation(relation2) + val relation3 = + JSONRelation("path", 1.0, Some(StructType(StructField("b", StringType, true) :: Nil)))(null) + val logicalRelation3 = LogicalRelation(relation3) + + assert(relation1 === relation2) + assert(logicalRelation1.sameResult(logicalRelation2), + s"$logicalRelation1 and $logicalRelation2 should be considered having the same result.") + + assert(relation1 !== relation3) + assert(!logicalRelation1.sameResult(logicalRelation3), + s"$logicalRelation1 and $logicalRelation3 should be considered not having the same result.") + + assert(relation2 !== relation3) + assert(!logicalRelation2.sameResult(logicalRelation3), + s"$logicalRelation2 and $logicalRelation3 should be considered not having the same result.") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala index 9626252e742e5..53f5f7426e9e6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala @@ -28,7 +28,11 @@ abstract class DataSourceTest extends QueryTest with BeforeAndAfter { implicit val caseInsensisitiveContext = new SQLContext(TestSQLContext.sparkContext) { @transient override protected[sql] lazy val analyzer: Analyzer = - new Analyzer(catalog, functionRegistry, caseSensitive = false) + new Analyzer(catalog, functionRegistry, caseSensitive = false) { + override val extendedRules = + PreInsertCastAndRename :: + Nil + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertIntoSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertIntoSuite.scala index f91cea6a37060..36e504e759152 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertIntoSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertIntoSuite.scala @@ -63,6 +63,41 @@ class InsertIntoSuite extends DataSourceTest with BeforeAndAfterAll { ) } + test("PreInsert casting and renaming") { + sql( + s""" + |INSERT OVERWRITE TABLE jsonTable SELECT a * 2, a * 4 FROM jt + """.stripMargin) + + checkAnswer( + sql("SELECT a, b FROM jsonTable"), + (1 to 10).map(i => Row(i * 2, s"${i * 4}")) + ) + + sql( + s""" + |INSERT OVERWRITE TABLE jsonTable SELECT a * 4 AS A, a * 6 as c FROM jt + """.stripMargin) + + checkAnswer( + sql("SELECT a, b FROM jsonTable"), + (1 to 10).map(i => Row(i * 4, s"${i * 6}")) + ) + } + + test("SELECT clause generating a different number of columns is not allowed.") { + val message = intercept[RuntimeException] { + sql( + s""" + |INSERT OVERWRITE TABLE jsonTable SELECT a FROM jt + """.stripMargin) + }.getMessage + assert( + message.contains("generates the same number of columns as its schema"), + "SELECT clause generating a different number of columns should not be not allowed." + ) + } + test("INSERT OVERWRITE a JSONRelation multiple times") { sql( s""" @@ -93,4 +128,49 @@ class InsertIntoSuite extends DataSourceTest with BeforeAndAfterAll { """.stripMargin) } } + + test("Caching") { + // Cached Query Execution + cacheTable("jsonTable") + assertCached(sql("SELECT * FROM jsonTable")) + checkAnswer( + sql("SELECT * FROM jsonTable"), + (1 to 10).map(i => Row(i, s"str$i"))) + + assertCached(sql("SELECT a FROM jsonTable")) + checkAnswer( + sql("SELECT a FROM jsonTable"), + (1 to 10).map(Row(_)).toSeq) + + assertCached(sql("SELECT a FROM jsonTable WHERE a < 5")) + checkAnswer( + sql("SELECT a FROM jsonTable WHERE a < 5"), + (1 to 4).map(Row(_)).toSeq) + + assertCached(sql("SELECT a * 2 FROM jsonTable")) + checkAnswer( + sql("SELECT a * 2 FROM jsonTable"), + (1 to 10).map(i => Row(i * 2)).toSeq) + + assertCached(sql("SELECT x.a, y.a FROM jsonTable x JOIN jsonTable y ON x.a = y.a + 1"), 2) + checkAnswer( + sql("SELECT x.a, y.a FROM jsonTable x JOIN jsonTable y ON x.a = y.a + 1"), + (2 to 10).map(i => Row(i, i - 1)).toSeq) + + // Insert overwrite and keep the same schema. + sql( + s""" + |INSERT OVERWRITE TABLE jsonTable SELECT a * 2, b FROM jt + """.stripMargin) + // jsonTable should be recached. + assertCached(sql("SELECT * FROM jsonTable")) + // The cached data is the new data. + checkAnswer( + sql("SELECT a, b FROM jsonTable"), + sql("SELECT a * 2, b FROM jt").collect()) + + // Verify uncaching + uncacheTable("jsonTable") + assertCached(sql("SELECT * FROM jsonTable"), 0) + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index d2371d4a5519e..ad37b7d0e6f59 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -324,6 +324,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { catalog.PreInsertionCasts :: ExtractPythonUdfs :: ResolveUdtfsAlias :: + sources.PreInsertCastAndRename :: Nil } From b62c35245a6c9bee6a335aa5dcfcca2547e85406 Mon Sep 17 00:00:00 2001 From: OopsOutOfMemory Date: Fri, 6 Feb 2015 12:41:28 -0800 Subject: [PATCH 029/357] [SQL][HiveConsole][DOC] HiveConsole `correct hiveconsole imports` Sorry for that PR #4330 has some mistakes. I correct it.... so it works correctly now. 
Author: OopsOutOfMemory Closes #4389 from OopsOutOfMemory/doc and squashes the following commits: 843eed9 [OopsOutOfMemory] correct hiveconsole imports --- project/SparkBuild.scala | 1 + sql/README.md | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index f63f9c1982bb5..5e3051c091030 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -276,6 +276,7 @@ object Hive { |import org.apache.spark.sql.catalyst.plans.logical._ |import org.apache.spark.sql.catalyst.rules._ |import org.apache.spark.sql.catalyst.util._ + |import org.apache.spark.sql.Dsl._ |import org.apache.spark.sql.execution |import org.apache.spark.sql.hive._ |import org.apache.spark.sql.hive.test.TestHive._ diff --git a/sql/README.md b/sql/README.md index 254ab8eb6376f..a79249965ee67 100644 --- a/sql/README.md +++ b/sql/README.md @@ -29,12 +29,19 @@ From here you can execute queries with HiveQl and manipulate DataFrame by using catalyst$ build/sbt hive/console [info] Starting scala interpreter... +import org.apache.spark.sql.catalyst.analysis._ +import org.apache.spark.sql.catalyst.dsl._ +import org.apache.spark.sql.catalyst.errors._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules._ +import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.execution import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.types._ import org.apache.spark.sql.parquet.ParquetTestData -Welcome to Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_45). Type in expressions to have them evaluated. Type :help for more information. From bc36356080e3b52aaf61fc1e6b204146ab96d29f Mon Sep 17 00:00:00 2001 From: wangfei Date: Fri, 6 Feb 2015 12:42:23 -0800 Subject: [PATCH 030/357] [SQL][Minor] Remove cache keyword in SqlParser Since the cache keyword is already defined in `SparkSQLParser`, and `SqlParser` in catalyst is a more general parser that should not cover keywords related to the underlying compute engine, this PR removes the cache keyword from `SqlParser`.
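As a clarifying sketch (not part of this patch, assuming a `SQLContext` named `sqlContext` and a registered table `src`): caching statements are unaffected, since `SparkSQLParser` still owns the keyword and handles the statement before the catalyst `SqlParser` is consulted.
```
// CACHE TABLE is parsed by SparkSQLParser, not by the catalyst SqlParser,
// so removing the keyword here leaves the statement working as before.
sqlContext.sql("CACHE TABLE src")
```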
Author: wangfei Closes #4393 from scwf/remove-cache-keyword and squashes the following commits: 10ade16 [wangfei] remove cache keyword in sql parser --- .../src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index a9bd079c7049d..1c588ee2f7176 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -57,7 +57,6 @@ class SqlParser extends AbstractSparkSQLParser { protected val AVG = Keyword("AVG") protected val BETWEEN = Keyword("BETWEEN") protected val BY = Keyword("BY") - protected val CACHE = Keyword("CACHE") protected val CASE = Keyword("CASE") protected val CAST = Keyword("CAST") protected val COALESCE = Keyword("COALESCE") From 4793c8402a19afe4df51129a7f99e07494a76af2 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 6 Feb 2015 13:08:09 -0800 Subject: [PATCH 031/357] [SPARK-5278][SQL] Introduce UnresolvedGetField and complete the check of ambiguous reference to fields When the `GetField` chain (`a.b.c.d...`) is interrupted by `GetItem`, as in `a.b[0].c.d...`, the check of ambiguous reference to fields is broken. The reason is that for something like `a.b[0].c.d`, we first parse it to `GetField(GetField(GetItem(Unresolved("a.b"), 0), "c"), "d")`. Then in `LogicalPlan#resolve`, we resolve `"a.b"` and build a `GetField` chain from the bottom (the relation). But for the two outer `GetField`s, we have to resolve them in `Analyzer`, or do it lazily in `GetField`: check the data type of the child, search for the needed field, etc., which is similar to what we already do in `LogicalPlan#resolve`. So in this PR the fix simply copies the same logic from `LogicalPlan#resolve` into `Analyzer`, which is simple and quick, but I do suggest introducing `UnresolvedGetField` as explained in https://github.com/apache/spark/pull/2405.
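To make the tree shape concrete, here is a small sketch (not part of this patch) of how `a.b[0].c.d` is represented after this change, built only from classes that appear in the diff below:
```
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedGetField}
import org.apache.spark.sql.catalyst.expressions.{GetItem, Literal}

// The parser now emits UnresolvedGetField for the trailing field accesses; the
// Analyzer later resolves each one into GetField(child, field, ordinal) and raises
// an error if a field name matches more than one struct field (ambiguous reference).
val parsed =
  UnresolvedGetField(
    UnresolvedGetField(
      GetItem(UnresolvedAttribute("a.b"), Literal(0)),
      "c"),
    "d")
```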
Author: Wenchen Fan Closes #4068 from cloud-fan/simple and squashes the following commits: a6857b5 [Wenchen Fan] fix import order 8411c40 [Wenchen Fan] use UnresolvedGetField --- .../apache/spark/sql/catalyst/SqlParser.scala | 2 +- .../sql/catalyst/analysis/Analyzer.scala | 34 +++++++++++++------ .../sql/catalyst/analysis/unresolved.scala | 12 +++++++ .../spark/sql/catalyst/dsl/package.scala | 4 +-- .../catalyst/expressions/complexTypes.scala | 24 ++----------- .../sql/catalyst/optimizer/Optimizer.scala | 2 +- .../catalyst/plans/logical/LogicalPlan.scala | 33 ++---------------- .../ExpressionEvaluationSuite.scala | 25 ++++++++++---- .../optimizer/ConstantFoldingSuite.scala | 4 +-- .../scala/org/apache/spark/sql/Column.scala | 3 +- .../org/apache/spark/sql/hive/HiveQl.scala | 2 +- .../hive/execution/HiveResolutionSuite.scala | 27 +++++++++------ 12 files changed, 84 insertions(+), 88 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index 1c588ee2f7176..124f083669358 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -372,7 +372,7 @@ class SqlParser extends AbstractSparkSQLParser { | expression ~ ("[" ~> expression <~ "]") ^^ { case base ~ ordinal => GetItem(base, ordinal) } | (expression <~ ".") ~ ident ^^ - { case base ~ fieldName => GetField(base, fieldName) } + { case base ~ fieldName => UnresolvedGetField(base, fieldName) } | cast | "(" ~> expression <~ ")" | function diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index ae1aee02c64a5..0b59ed1739566 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -285,7 +285,7 @@ class Analyzer(catalog: Catalog, case q: LogicalPlan => logTrace(s"Attempting to resolve ${q.simpleString}") - q transformExpressions { + q transformExpressionsUp { case u @ UnresolvedAttribute(name) if resolver(name, VirtualColumn.groupingIdName) && q.isInstanceOf[GroupingAnalytics] => // Resolve the virtual column GROUPING__ID for the operator GroupingAnalytics @@ -295,15 +295,8 @@ class Analyzer(catalog: Catalog, val result = q.resolveChildren(name, resolver).getOrElse(u) logDebug(s"Resolving $u to $result") result - - // Resolve field names using the resolver. - case f @ GetField(child, fieldName) if !f.resolved && child.resolved => - child.dataType match { - case StructType(fields) => - val resolvedFieldName = fields.map(_.name).find(resolver(_, fieldName)) - resolvedFieldName.map(n => f.copy(fieldName = n)).getOrElse(f) - case _ => f - } + case UnresolvedGetField(child, fieldName) if child.resolved => + resolveGetField(child, fieldName) } } @@ -312,6 +305,27 @@ class Analyzer(catalog: Catalog, */ protected def containsStar(exprs: Seq[Expression]): Boolean = exprs.exists(_.collect { case _: Star => true }.nonEmpty) + + /** + * Returns the resolved `GetField`, and report error if no desired field or over one + * desired fields are found. 
+ */ + protected def resolveGetField(expr: Expression, fieldName: String): Expression = { + expr.dataType match { + case StructType(fields) => + val actualField = fields.filter(f => resolver(f.name, fieldName)) + if (actualField.length == 0) { + sys.error( + s"No such struct field $fieldName in ${fields.map(_.name).mkString(", ")}") + } else if (actualField.length == 1) { + val field = actualField(0) + GetField(expr, field, fields.indexOf(field)) + } else { + sys.error(s"Ambiguous reference to fields ${actualField.mkString(", ")}") + } + case otherType => sys.error(s"GetField is not valid on fields of type $otherType") + } + } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index f35921e2a772c..f959a50564011 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -177,3 +177,15 @@ case class ResolvedStar(expressions: Seq[NamedExpression]) extends Star { override def expand(input: Seq[Attribute], resolver: Resolver): Seq[NamedExpression] = expressions override def toString = expressions.mkString("ResolvedStar(", ", ", ")") } + +case class UnresolvedGetField(child: Expression, fieldName: String) extends UnaryExpression { + override def dataType = throw new UnresolvedException(this, "dataType") + override def foldable = throw new UnresolvedException(this, "foldable") + override def nullable = throw new UnresolvedException(this, "nullable") + override lazy val resolved = false + + override def eval(input: Row = null): EvaluatedType = + throw new TreeNodeException(this, s"No function to evaluate expression. 
type: ${this.nodeName}") + + override def toString = s"$child.$fieldName" +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 417659eed5957..51a09ac0e1249 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -22,7 +22,7 @@ import java.sql.{Date, Timestamp} import scala.language.implicitConversions import scala.reflect.runtime.universe.{TypeTag, typeTag} -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.analysis.{UnresolvedGetField, UnresolvedAttribute} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} @@ -101,7 +101,7 @@ package object dsl { def isNotNull = IsNotNull(expr) def getItem(ordinal: Expression) = GetItem(expr, ordinal) - def getField(fieldName: String) = GetField(expr, fieldName) + def getField(fieldName: String) = UnresolvedGetField(expr, fieldName) def cast(to: DataType) = Cast(expr, to) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala index 1bc34f71441fe..66e2e5c4bafce 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala @@ -73,39 +73,19 @@ case class GetItem(child: Expression, ordinal: Expression) extends Expression { /** * Returns the value of fields in the Struct `child`. */ -case class GetField(child: Expression, fieldName: String) extends UnaryExpression { +case class GetField(child: Expression, field: StructField, ordinal: Int) extends UnaryExpression { type EvaluatedType = Any def dataType = field.dataType override def nullable = child.nullable || field.nullable override def foldable = child.foldable - protected def structType = child.dataType match { - case s: StructType => s - case otherType => sys.error(s"GetField is not valid on fields of type $otherType") - } - - lazy val field = - structType.fields - .find(_.name == fieldName) - .getOrElse(sys.error(s"No such field $fieldName in ${child.dataType}")) - - lazy val ordinal = structType.fields.indexOf(field) - - override lazy val resolved = childrenResolved && fieldResolved - - /** Returns true only if the fieldName is found in the child struct. 
*/ - private def fieldResolved = child.dataType match { - case StructType(fields) => fields.map(_.name).contains(fieldName) - case _ => false - } - override def eval(input: Row): Any = { val baseValue = child.eval(input).asInstanceOf[Row] if (baseValue == null) null else baseValue(ordinal) } - override def toString = s"$child.$fieldName" + override def toString = s"$child.${field.name}" } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 376a9f36568a7..8c8f2896eb99b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -206,7 +206,7 @@ object NullPropagation extends Rule[LogicalPlan] { case e @ IsNotNull(c) if !c.nullable => Literal(true, BooleanType) case e @ GetItem(Literal(null, _), _) => Literal(null, e.dataType) case e @ GetItem(_, Literal(null, _)) => Literal(null, e.dataType) - case e @ GetField(Literal(null, _), _) => Literal(null, e.dataType) + case e @ GetField(Literal(null, _), _, _) => Literal(null, e.dataType) case e @ EqualNullSafe(Literal(null, _), r) => IsNull(r) case e @ EqualNullSafe(l, Literal(null, _)) => IsNull(l) case e @ Count(expr) if !expr.nullable => Count(Literal(1)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 65ae066e4b4b5..8d30528328946 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.catalyst.analysis.{UnresolvedGetField, Resolver} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan @@ -160,11 +160,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { // One match, but we also need to extract the requested nested field. case Seq((a, nestedFields)) => - val aliased = - Alias( - resolveNesting(nestedFields, a, resolver), - nestedFields.last)() // Preserve the case of the user's field access. - Some(aliased) + Some(Alias(nestedFields.foldLeft(a: Expression)(UnresolvedGetField), nestedFields.last)()) // No matches. case Seq() => @@ -177,31 +173,6 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { this, s"Ambiguous references to $name: ${ambiguousReferences.mkString(",")}") } } - - /** - * Given a list of successive nested field accesses, and a based expression, attempt to resolve - * the actual field lookups on this expression. 
- */ - private def resolveNesting( - nestedFields: List[String], - expression: Expression, - resolver: Resolver): Expression = { - - (nestedFields, expression.dataType) match { - case (Nil, _) => expression - case (requestedField :: rest, StructType(fields)) => - val actualField = fields.filter(f => resolver(f.name, requestedField)) - if (actualField.length == 0) { - sys.error( - s"No such struct field $requestedField in ${fields.map(_.name).mkString(", ")}") - } else if (actualField.length == 1) { - resolveNesting(rest, GetField(expression, actualField(0).name), resolver) - } else { - sys.error(s"Ambiguous reference to fields ${actualField.mkString(", ")}") - } - case (_, dt) => sys.error(s"Can't access nested field in type $dt") - } - } } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index 25d1c105a00a6..7cf6c80194f6c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -26,6 +26,7 @@ import org.scalatest.FunSuite import org.scalatest.Matchers._ import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.analysis.UnresolvedGetField import org.apache.spark.sql.types._ @@ -846,23 +847,33 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation(GetItem(BoundReference(4, typeArray, true), Literal(null, IntegerType)), null, row) - checkEvaluation(GetField(BoundReference(2, typeS, nullable = true), "a"), "aa", row) - checkEvaluation(GetField(Literal(null, typeS), "a"), null, row) + def quickBuildGetField(expr: Expression, fieldName: String) = { + expr.dataType match { + case StructType(fields) => + val field = fields.find(_.name == fieldName).get + GetField(expr, field, fields.indexOf(field)) + } + } + + def quickResolve(u: UnresolvedGetField) = quickBuildGetField(u.child, u.fieldName) + + checkEvaluation(quickBuildGetField(BoundReference(2, typeS, nullable = true), "a"), "aa", row) + checkEvaluation(quickBuildGetField(Literal(null, typeS), "a"), null, row) val typeS_notNullable = StructType( StructField("a", StringType, nullable = false) :: StructField("b", StringType, nullable = false) :: Nil ) - assert(GetField(BoundReference(2,typeS, nullable = true), "a").nullable === true) - assert(GetField(BoundReference(2, typeS_notNullable, nullable = false), "a").nullable === false) + assert(quickBuildGetField(BoundReference(2,typeS, nullable = true), "a").nullable === true) + assert(quickBuildGetField(BoundReference(2, typeS_notNullable, nullable = false), "a").nullable === false) - assert(GetField(Literal(null, typeS), "a").nullable === true) - assert(GetField(Literal(null, typeS_notNullable), "a").nullable === true) + assert(quickBuildGetField(Literal(null, typeS), "a").nullable === true) + assert(quickBuildGetField(Literal(null, typeS_notNullable), "a").nullable === true) checkEvaluation('c.map(typeMap).at(3).getItem("aa"), "bb", row) checkEvaluation('c.array(typeArray.elementType).at(4).getItem(1), "bb", row) - checkEvaluation('c.struct(typeS).at(2).getField("a"), "aa", row) + checkEvaluation(quickResolve('c.struct(typeS).at(2).getField("a")), "aa", row) } test("arithmetic") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index 9fdf3efa02bb6..e22c62505860a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.optimizer -import org.apache.spark.sql.catalyst.analysis.EliminateAnalysisOperators +import org.apache.spark.sql.catalyst.analysis.{UnresolvedGetField, EliminateAnalysisOperators} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.plans.PlanTest @@ -184,7 +184,7 @@ class ConstantFoldingSuite extends PlanTest { GetItem(Literal(null, ArrayType(IntegerType)), 1) as 'c3, GetItem(Literal(Seq(1), ArrayType(IntegerType)), Literal(null, IntegerType)) as 'c4, - GetField( + UnresolvedGetField( Literal(null, StructType(Seq(StructField("a", IntegerType, true)))), "a") as 'c5, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 4c2aeadae9492..878b2b0556de7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -23,6 +23,7 @@ import scala.language.implicitConversions import org.apache.spark.sql.Dsl.lit import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{Subquery, Project, LogicalPlan} +import org.apache.spark.sql.catalyst.analysis.UnresolvedGetField import org.apache.spark.sql.types._ @@ -505,7 +506,7 @@ trait Column extends DataFrame { /** * An expression that gets a field by name in a [[StructField]]. */ - def getField(fieldName: String): Column = exprToColumn(GetField(expr, fieldName)) + def getField(fieldName: String): Column = exprToColumn(UnresolvedGetField(expr, fieldName)) /** * An expression that returns a substring. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 4b7fa06532963..2a4b88092179f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -1038,7 +1038,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C nodeToExpr(qualifier) match { case UnresolvedAttribute(qualifierName) => UnresolvedAttribute(qualifierName + "." 
+ cleanIdentifier(attr)) - case other => GetField(other, attr) + case other => UnresolvedGetField(other, attr) } /* Stars (*) */ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala index 178ece447758e..ff8130ae5f6bc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.hive.execution -import org.apache.spark.sql.hive.test.TestHive -import org.apache.spark.sql.hive.test.TestHive.{sparkContext, sql} +import org.apache.spark.sql.hive.test.TestHive.{sparkContext, jsonRDD, sql} import org.apache.spark.sql.hive.test.TestHive.implicits._ case class Nested(a: Int, B: Int) @@ -29,16 +28,24 @@ case class Data(a: Int, B: Int, n: Nested, nestedArray: Seq[Nested]) */ class HiveResolutionSuite extends HiveComparisonTest { - case class NestedData(a: Seq[NestedData2], B: NestedData2) - case class NestedData2(a: NestedData3, B: NestedData3) - case class NestedData3(a: Int, B: Int) - test("SPARK-3698: case insensitive test for nested data") { - sparkContext.makeRDD(Seq.empty[NestedData]).registerTempTable("nested") + jsonRDD(sparkContext.makeRDD( + """{"a": [{"a": {"a": 1}}]}""" :: Nil)).registerTempTable("nested") // This should be successfully analyzed sql("SELECT a[0].A.A from nested").queryExecution.analyzed } + test("SPARK-5278: check ambiguous reference to fields") { + jsonRDD(sparkContext.makeRDD( + """{"a": [{"b": 1, "B": 2}]}""" :: Nil)).registerTempTable("nested") + + // there are 2 filed matching field name "b", we should report Ambiguous reference error + val exception = intercept[RuntimeException] { + sql("SELECT a[0].b from nested").queryExecution.analyzed + } + assert(exception.getMessage.contains("Ambiguous reference to fields")) + } + createQueryTest("table.attr", "SELECT src.key FROM src ORDER BY key LIMIT 1") @@ -68,7 +75,7 @@ class HiveResolutionSuite extends HiveComparisonTest { test("case insensitivity with scala reflection") { // Test resolution with Scala Reflection - TestHive.sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) + sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) .registerTempTable("caseSensitivityTest") val query = sql("SELECT a, b, A, B, n.a, n.b, n.A, n.B FROM caseSensitivityTest") @@ -79,14 +86,14 @@ class HiveResolutionSuite extends HiveComparisonTest { ignore("case insensitivity with scala reflection joins") { // Test resolution with Scala Reflection - TestHive.sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) + sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) .registerTempTable("caseSensitivityTest") sql("SELECT * FROM casesensitivitytest a JOIN casesensitivitytest b ON a.a = b.a").collect() } test("nested repeated resolution") { - TestHive.sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) + sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) .registerTempTable("nestedRepeatedTest") assert(sql("SELECT nestedArray[0].a FROM nestedRepeatedTest").collect().head(0) === 1) } From 3d3ecd77411bfdd1c72be51616c46a6caf839be2 Mon Sep 17 00:00:00 2001 From: OopsOutOfMemory Date: Fri, 6 Feb 2015 13:20:10 -0800 Subject: [PATCH 032/357] [SPARK-5586][Spark Shell][SQL] Make `sqlContext` available in spark shell Result is 
like this ``` 15/02/05 13:41:22 INFO SparkILoop: Created spark context.. Spark context available as sc. 15/02/05 13:41:22 INFO SparkILoop: Created sql context.. SQLContext available as sqlContext. scala> sq sql sqlContext sqlParser sqrt ``` Author: OopsOutOfMemory Closes #4387 from OopsOutOfMemory/sqlContextInShell and squashes the following commits: c7f5203 [OopsOutOfMemory] auto-import sql() function e160697 [OopsOutOfMemory] Merge branch 'sqlContextInShell' of https://github.com/OopsOutOfMemory/spark into sqlContextInShell 37c0a16 [OopsOutOfMemory] auto detect hive support a9c59d9 [OopsOutOfMemory] rename and reduce range of imports 6b9e309 [OopsOutOfMemory] Merge branch 'master' into sqlContextInShell cae652f [OopsOutOfMemory] make sqlContext available in spark shell --- .../org/apache/spark/repl/SparkILoop.scala | 19 +++++++++++++++++++ .../apache/spark/repl/SparkILoopInit.scala | 10 ++++++++++ .../scala/org/apache/spark/repl/Main.scala | 18 ++++++++++++++++++ .../org/apache/spark/repl/SparkILoop.scala | 12 +++++++++++- 4 files changed, 58 insertions(+), 1 deletion(-) diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala index 72c1a989999b4..b4db3df795177 100644 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala @@ -45,6 +45,7 @@ import scala.reflect.api.{Mirror, TypeCreator, Universe => ApiUniverse} import org.apache.spark.Logging import org.apache.spark.SparkConf import org.apache.spark.SparkContext +import org.apache.spark.sql.SQLContext import org.apache.spark.util.Utils /** The Scala interactive shell. It provides a read-eval-print loop @@ -130,6 +131,7 @@ class SparkILoop( // NOTE: Must be public for visibility @DeveloperApi var sparkContext: SparkContext = _ + var sqlContext: SQLContext = _ override def echoCommandMessage(msg: String) { intp.reporter printMessage msg @@ -1016,6 +1018,23 @@ class SparkILoop( sparkContext } + @DeveloperApi + def createSQLContext(): SQLContext = { + val name = "org.apache.spark.sql.hive.HiveContext" + val loader = Utils.getContextOrSparkClassLoader + try { + sqlContext = loader.loadClass(name).getConstructor(classOf[SparkContext]) + .newInstance(sparkContext).asInstanceOf[SQLContext] + logInfo("Created sql context (with Hive support)..") + } + catch { + case cnf: java.lang.ClassNotFoundException => + sqlContext = new SQLContext(sparkContext) + logInfo("Created sql context..") + } + sqlContext + } + private def getMaster(): String = { val master = this.master match { case Some(m) => m diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala index 99bd777c04fdb..0cf2de6d399b0 100644 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala +++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala @@ -127,7 +127,17 @@ private[repl] trait SparkILoopInit { _sc } """) + command(""" + @transient val sqlContext = { + val _sqlContext = org.apache.spark.repl.Main.interp.createSQLContext() + println("SQL context available as sqlContext.") + _sqlContext + } + """) command("import org.apache.spark.SparkContext._") + command("import sqlContext.implicits._") + command("import sqlContext.sql") + command("import org.apache.spark.sql.Dsl._") } } diff --git 
a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala index 69e44d4f916e1..dc25692749aad 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala @@ -19,6 +19,7 @@ package org.apache.spark.repl import org.apache.spark.util.Utils import org.apache.spark._ +import org.apache.spark.sql.SQLContext import scala.tools.nsc.Settings import scala.tools.nsc.interpreter.SparkILoop @@ -34,6 +35,7 @@ object Main extends Logging { "-Yrepl-outdir", s"${outputDir.getAbsolutePath}", "-Yrepl-sync"), true) val classServer = new HttpServer(conf, outputDir, new SecurityManager(conf)) var sparkContext: SparkContext = _ + var sqlContext: SQLContext = _ var interp = new SparkILoop // this is a public var because tests reset it. def main(args: Array[String]) { @@ -74,6 +76,22 @@ object Main extends Logging { sparkContext } + def createSQLContext(): SQLContext = { + val name = "org.apache.spark.sql.hive.HiveContext" + val loader = Utils.getContextOrSparkClassLoader + try { + sqlContext = loader.loadClass(name).getConstructor(classOf[SparkContext]) + .newInstance(sparkContext).asInstanceOf[SQLContext] + logInfo("Created sql context (with Hive support)..") + } + catch { + case cnf: java.lang.ClassNotFoundException => + sqlContext = new SQLContext(sparkContext) + logInfo("Created sql context..") + } + sqlContext + } + private def getMaster: String = { val master = { val envMaster = sys.env.get("MASTER") diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala index 250727305970d..1bd2a6991404b 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala @@ -66,8 +66,18 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter) println("Spark context available as sc.") _sc } - """) + """) + command( """ + @transient val sqlContext = { + val _sqlContext = org.apache.spark.repl.Main.createSQLContext() + println("SQL context available as sqlContext.") + _sqlContext + } + """) command("import org.apache.spark.SparkContext._") + command("import sqlContext.implicits._") + command("import sqlContext.sql") + command("import org.apache.spark.sql.Dsl._") } } From 0f3a36071a44e986d97981032d5b192477b38bbd Mon Sep 17 00:00:00 2001 From: GenTang Date: Fri, 6 Feb 2015 13:27:34 -0800 Subject: [PATCH 033/357] [SPARK-4983] Insert waiting time before tagging EC2 instances The boto API doesn't support tagging EC2 instances in the same call that launches them. We add a five-second wait so EC2 has enough time to propagate the information so that the tagging can succeed.
Author: GenTang Author: Gen TANG Closes #3986 from GenTang/spark-4983 and squashes the following commits: 13e257d [Gen TANG] modification of comments 47f06755 [GenTang] print the information ab7a931 [GenTang] solve the issus spark-4983 by inserting waiting time 3179737 [GenTang] Revert "handling exceptions about adding tags to ec2" 6a8b53b [GenTang] Revert "the improvement of exception handling" 13e97a6 [GenTang] Revert "typo" 63fd360 [GenTang] typo 692fc2b [GenTang] the improvement of exception handling 6adcf6d [GenTang] handling exceptions about adding tags to ec2 --- ec2/spark_ec2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 0de4a62e203fd..7371558274d78 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -569,6 +569,9 @@ def launch_cluster(conn, opts, cluster_name): master_nodes = master_res.instances print "Launched master in %s, regid = %s" % (zone, master_res.id) + # This wait time corresponds to SPARK-4983 + print "Waiting for AWS to propagate instance metadata..." + time.sleep(5) # Give the instances descriptive names for master in master_nodes: master.add_tag( From 65181b7512c2192311705dfdfad588a80d3a6fde Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 6 Feb 2015 13:52:35 -0800 Subject: [PATCH 034/357] [HOTFIX][MLLIB] fix a compilation error with java 6 Author: Xiangrui Meng Closes #4442 from mengxr/java6-fix and squashes the following commits: 2098500 [Xiangrui Meng] fix a compilation error with java 6 --- .../org/apache/spark/examples/ml/JavaDeveloperApiExample.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index 42d4d7d0bef26..4d9dad9f23038 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -116,7 +116,7 @@ class MyJavaLogisticRegression */ IntParam maxIter = new IntParam(this, "maxIter", "max number of iterations"); - int getMaxIter() { return (int)get(maxIter); } + int getMaxIter() { return (Integer) get(maxIter); } public MyJavaLogisticRegression() { setMaxIter(100); @@ -124,7 +124,7 @@ public MyJavaLogisticRegression() { // The parameter setter is in this class since it should return type MyJavaLogisticRegression. MyJavaLogisticRegression setMaxIter(int value) { - return (MyJavaLogisticRegression)set(maxIter, value); + return (MyJavaLogisticRegression) set(maxIter, value); } // This method is used by fit(). 
From b3872e00d155939e40366debda635fc3fb12cc73 Mon Sep 17 00:00:00 2001 From: Vladimir Vladimirov Date: Fri, 6 Feb 2015 13:55:02 -0800 Subject: [PATCH 035/357] SPARK-5633 pyspark saveAsTextFile support for compression codec See https://issues.apache.org/jira/browse/SPARK-5633 for details Author: Vladimir Vladimirov Closes #4403 from smartkiwi/master and squashes the following commits: 94c014e [Vladimir Vladimirov] SPARK-5633 pyspark saveAsTextFile support for compression codec --- python/pyspark/rdd.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 6e029bf7f13fc..bd4f16e058045 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1366,10 +1366,14 @@ def saveAsPickleFile(self, path, batchSize=10): ser = BatchedSerializer(PickleSerializer(), batchSize) self._reserialize(ser)._jrdd.saveAsObjectFile(path) - def saveAsTextFile(self, path): + def saveAsTextFile(self, path, compressionCodecClass=None): """ Save this RDD as a text file, using string representations of elements. + @param path: path to text file + @param compressionCodecClass: (None by default) string i.e. + "org.apache.hadoop.io.compress.GzipCodec" + >>> tempFile = NamedTemporaryFile(delete=True) >>> tempFile.close() >>> sc.parallelize(range(10)).saveAsTextFile(tempFile.name) @@ -1385,6 +1389,16 @@ def saveAsTextFile(self, path): >>> sc.parallelize(['', 'foo', '', 'bar', '']).saveAsTextFile(tempFile2.name) >>> ''.join(sorted(input(glob(tempFile2.name + "/part-0000*")))) '\\n\\n\\nbar\\nfoo\\n' + + Using compressionCodecClass + + >>> tempFile3 = NamedTemporaryFile(delete=True) + >>> tempFile3.close() + >>> codec = "org.apache.hadoop.io.compress.GzipCodec" + >>> sc.parallelize(['foo', 'bar']).saveAsTextFile(tempFile3.name, codec) + >>> from fileinput import input, hook_compressed + >>> ''.join(sorted(input(glob(tempFile3.name + "/part*.gz"), openhook=hook_compressed))) + 'bar\\nfoo\\n' """ def func(split, iterator): for x in iterator: @@ -1395,7 +1409,11 @@ def func(split, iterator): yield x keyed = self.mapPartitionsWithIndex(func) keyed._bypass_serializer = True - keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path) + if compressionCodecClass: + compressionCodec = self.ctx._jvm.java.lang.Class.forName(compressionCodecClass) + keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path, compressionCodec) + else: + keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path) # Pair functions From ca66159a4f30d65fa4cd32dbf3ff23978cb7f99b Mon Sep 17 00:00:00 2001 From: Kashish Jain Date: Fri, 6 Feb 2015 13:47:23 -0800 Subject: [PATCH 036/357] SPARK-5613: Catch the ApplicationNotFoundException exception to avoid thread from getting killed on yarn restart. [SPARK-5613] Added a catch block to catch the ApplicationNotFoundException. Without this catch block the thread gets killed on occurrence of this exception. This Exception occurs when yarn restarts and tries to find an application id for a spark job which got interrupted due to yarn getting stopped. See the stacktrace in the bug for more details. Author: Kashish Jain Closes #4392 from kasjain/branch-1.2 and squashes the following commits: 4831000 [Kashish Jain] SPARK-5613: Catch the ApplicationNotFoundException exception to avoid thread from getting killed on yarn restart. 
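In essence, the monitoring loop should map a missing application to a terminal state instead of letting the exception escape and kill the thread. A minimal sketch of that pattern (simplified, and using Hadoop's plain YarnClient rather than Spark's internal Client wrapper; the actual change is in the diff below):

```
import org.apache.hadoop.yarn.api.records.{ApplicationId, YarnApplicationState}
import org.apache.hadoop.yarn.client.api.YarnClient
import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException

// Polls the application state once; a vanished application is reported as KILLED
// so the caller's monitoring thread can shut down cleanly instead of dying.
def currentState(client: YarnClient, appId: ApplicationId): YarnApplicationState =
  try {
    client.getApplicationReport(appId).getYarnApplicationState()
  } catch {
    case _: ApplicationNotFoundException => YarnApplicationState.KILLED
  }
```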
--- .../cluster/YarnClientSchedulerBackend.scala | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index 690f927e938c3..f1b5aafac4066 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -20,6 +20,7 @@ package org.apache.spark.scheduler.cluster import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.yarn.api.records.{ApplicationId, YarnApplicationState} +import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException import org.apache.spark.{SparkException, Logging, SparkContext} import org.apache.spark.deploy.yarn.{Client, ClientArguments} @@ -133,8 +134,14 @@ private[spark] class YarnClientSchedulerBackend( val t = new Thread { override def run() { while (!stopping) { - val report = client.getApplicationReport(appId) - val state = report.getYarnApplicationState() + var state: YarnApplicationState = null + try { + val report = client.getApplicationReport(appId) + state = report.getYarnApplicationState() + } catch { + case e: ApplicationNotFoundException => + state = YarnApplicationState.KILLED + } if (state == YarnApplicationState.FINISHED || state == YarnApplicationState.KILLED || state == YarnApplicationState.FAILED) { From 5687bab8fdfdc5345b8c5b9be8d4595299005fc8 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 6 Feb 2015 14:23:09 -0800 Subject: [PATCH 037/357] [SPARK-5600] [core] Clean up FsHistoryProvider test, fix app sort order. Clean up some test setup code to remove duplicate instantiation of the provider. Also make sure unfinished apps are sorted correctly. Author: Marcelo Vanzin Closes #4370 from vanzin/SPARK-5600 and squashes the following commits: 0d048d5 [Marcelo Vanzin] Cleanup test code a bit. 2585119 [Marcelo Vanzin] Review feedback. 8b97544 [Marcelo Vanzin] Merge branch 'master' into SPARK-5600 be979e9 [Marcelo Vanzin] Merge branch 'master' into SPARK-5600 298371c [Marcelo Vanzin] [SPARK-5600] [core] Clean up FsHistoryProvider test, fix app sort order. 
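The intended listing order is: most recently ended application first, with in-progress applications (endTime of -1) falling back to a comparison on start time. A self-contained sketch of that ordering, using a hypothetical Info case class in place of FsApplicationHistoryInfo:

```
case class Info(name: String, startTime: Long, endTime: Long)

// Mirrors the comparator introduced below: sort by end time descending,
// breaking ties (e.g. two unfinished apps with endTime == -1) by start time.
def precedes(a: Info, b: Info): Boolean =
  if (a.endTime != b.endTime) a.endTime >= b.endTime else a.startTime >= b.startTime

val apps = Seq(
  Info("old-app-complete", 2L, 3L),
  Info("new-app-complete", 1L, 4L),
  Info("new-app-incomplete", 1L, -1L),
  Info("old-app-incomplete", 2L, -1L))

// List(new-app-complete, old-app-complete, old-app-incomplete, new-app-incomplete)
apps.sortWith(precedes).map(_.name)
```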
--- .../deploy/history/FsHistoryProvider.scala | 15 +++- .../history/FsHistoryProviderSuite.scala | 85 +++++++++---------- 2 files changed, 52 insertions(+), 48 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 92125f2df7d10..868c63d30a202 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -194,7 +194,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis None } } - .sortBy { info => (-info.endTime, -info.startTime) } + .sortWith(compareAppInfo) lastModifiedTime = newLastModifiedTime @@ -214,7 +214,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis val newIterator = logInfos.iterator.buffered val oldIterator = applications.values.iterator.buffered while (newIterator.hasNext && oldIterator.hasNext) { - if (newIterator.head.endTime > oldIterator.head.endTime) { + if (compareAppInfo(newIterator.head, oldIterator.head)) { addIfAbsent(newIterator.next) } else { addIfAbsent(oldIterator.next) @@ -230,6 +230,17 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis } } + /** + * Comparison function that defines the sort order for the application listing. + * + * @return Whether `i1` should precede `i2`. + */ + private def compareAppInfo( + i1: FsApplicationHistoryInfo, + i2: FsApplicationHistoryInfo): Boolean = { + if (i1.endTime != i2.endTime) i1.endTime >= i2.endTime else i1.startTime >= i2.startTime + } + /** * Replays the events in the specified log file and returns information about the associated * application. diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 1d95432258111..85939eaadccc7 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -37,13 +37,8 @@ class FsHistoryProviderSuite extends FunSuite with BeforeAndAfter with Matchers private var testDir: File = null - private var provider: FsHistoryProvider = null - before { testDir = Utils.createTempDir() - provider = new FsHistoryProvider(new SparkConf() - .set("spark.history.fs.logDirectory", testDir.getAbsolutePath()) - .set("spark.history.fs.updateInterval", "0")) } after { @@ -51,40 +46,41 @@ class FsHistoryProviderSuite extends FunSuite with BeforeAndAfter with Matchers } test("Parse new and old application logs") { - val conf = new SparkConf() - .set("spark.history.fs.logDirectory", testDir.getAbsolutePath()) - .set("spark.history.fs.updateInterval", "0") - val provider = new FsHistoryProvider(conf) + val provider = new FsHistoryProvider(createTestConf()) // Write a new-style application log. - val logFile1 = new File(testDir, "new1") - writeFile(logFile1, true, None, - SparkListenerApplicationStart("app1-1", None, 1L, "test"), - SparkListenerApplicationEnd(2L) + val newAppComplete = new File(testDir, "new1") + writeFile(newAppComplete, true, None, + SparkListenerApplicationStart("new-app-complete", None, 1L, "test"), + SparkListenerApplicationEnd(4L) ) // Write an unfinished app, new-style. 
- val logFile2 = new File(testDir, "new2" + EventLoggingListener.IN_PROGRESS) - writeFile(logFile2, true, None, - SparkListenerApplicationStart("app2-2", None, 1L, "test") + val newAppIncomplete = new File(testDir, "new2" + EventLoggingListener.IN_PROGRESS) + writeFile(newAppIncomplete, true, None, + SparkListenerApplicationStart("new-app-incomplete", None, 1L, "test") ) // Write an old-style application log. - val oldLog = new File(testDir, "old1") - oldLog.mkdir() - createEmptyFile(new File(oldLog, provider.SPARK_VERSION_PREFIX + "1.0")) - writeFile(new File(oldLog, provider.LOG_PREFIX + "1"), false, None, - SparkListenerApplicationStart("app3", None, 2L, "test"), + val oldAppComplete = new File(testDir, "old1") + oldAppComplete.mkdir() + createEmptyFile(new File(oldAppComplete, provider.SPARK_VERSION_PREFIX + "1.0")) + writeFile(new File(oldAppComplete, provider.LOG_PREFIX + "1"), false, None, + SparkListenerApplicationStart("old-app-complete", None, 2L, "test"), SparkListenerApplicationEnd(3L) ) - createEmptyFile(new File(oldLog, provider.APPLICATION_COMPLETE)) + createEmptyFile(new File(oldAppComplete, provider.APPLICATION_COMPLETE)) + + // Check for logs so that we force the older unfinished app to be loaded, to make + // sure unfinished apps are also sorted correctly. + provider.checkForLogs() // Write an unfinished app, old-style. - val oldLog2 = new File(testDir, "old2") - oldLog2.mkdir() - createEmptyFile(new File(oldLog2, provider.SPARK_VERSION_PREFIX + "1.0")) - writeFile(new File(oldLog2, provider.LOG_PREFIX + "1"), false, None, - SparkListenerApplicationStart("app4", None, 2L, "test") + val oldAppIncomplete = new File(testDir, "old2") + oldAppIncomplete.mkdir() + createEmptyFile(new File(oldAppIncomplete, provider.SPARK_VERSION_PREFIX + "1.0")) + writeFile(new File(oldAppIncomplete, provider.LOG_PREFIX + "1"), false, None, + SparkListenerApplicationStart("old-app-incomplete", None, 2L, "test") ) // Force a reload of data from the log directory, and check that both logs are loaded. @@ -96,14 +92,14 @@ class FsHistoryProviderSuite extends FunSuite with BeforeAndAfter with Matchers list.size should be (4) list.count(e => e.completed) should be (2) - list(0) should be (ApplicationHistoryInfo(oldLog.getName(), "app3", 2L, 3L, - oldLog.lastModified(), "test", true)) - list(1) should be (ApplicationHistoryInfo(logFile1.getName(), "app1-1", 1L, 2L, - logFile1.lastModified(), "test", true)) - list(2) should be (ApplicationHistoryInfo(oldLog2.getName(), "app4", 2L, -1L, - oldLog2.lastModified(), "test", false)) - list(3) should be (ApplicationHistoryInfo(logFile2.getName(), "app2-2", 1L, -1L, - logFile2.lastModified(), "test", false)) + list(0) should be (ApplicationHistoryInfo(newAppComplete.getName(), "new-app-complete", 1L, 4L, + newAppComplete.lastModified(), "test", true)) + list(1) should be (ApplicationHistoryInfo(oldAppComplete.getName(), "old-app-complete", 2L, 3L, + oldAppComplete.lastModified(), "test", true)) + list(2) should be (ApplicationHistoryInfo(oldAppIncomplete.getName(), "old-app-incomplete", 2L, + -1L, oldAppIncomplete.lastModified(), "test", false)) + list(3) should be (ApplicationHistoryInfo(newAppIncomplete.getName(), "new-app-incomplete", 1L, + -1L, newAppIncomplete.lastModified(), "test", false)) // Make sure the UI can be rendered. 
list.foreach { case info => @@ -113,6 +109,7 @@ class FsHistoryProviderSuite extends FunSuite with BeforeAndAfter with Matchers } test("Parse legacy logs with compression codec set") { + val provider = new FsHistoryProvider(createTestConf()) val testCodecs = List((classOf[LZFCompressionCodec].getName(), true), (classOf[SnappyCompressionCodec].getName(), true), ("invalid.codec", false)) @@ -156,10 +153,7 @@ class FsHistoryProviderSuite extends FunSuite with BeforeAndAfter with Matchers ) logFile2.setReadable(false, false) - val conf = new SparkConf() - .set("spark.history.fs.logDirectory", testDir.getAbsolutePath()) - .set("spark.history.fs.updateInterval", "0") - val provider = new FsHistoryProvider(conf) + val provider = new FsHistoryProvider(createTestConf()) provider.checkForLogs() val list = provider.getListing().toSeq @@ -168,10 +162,7 @@ class FsHistoryProviderSuite extends FunSuite with BeforeAndAfter with Matchers } test("history file is renamed from inprogress to completed") { - val conf = new SparkConf() - .set("spark.history.fs.logDirectory", testDir.getAbsolutePath()) - .set("spark.testing", "true") - val provider = new FsHistoryProvider(conf) + val provider = new FsHistoryProvider(createTestConf()) val logFile1 = new File(testDir, "app1" + EventLoggingListener.IN_PROGRESS) writeFile(logFile1, true, None, @@ -191,9 +182,7 @@ class FsHistoryProviderSuite extends FunSuite with BeforeAndAfter with Matchers } test("SPARK-5582: empty log directory") { - val conf = new SparkConf() - .set("spark.history.fs.logDirectory", testDir.getAbsolutePath()) - val provider = new FsHistoryProvider(conf) + val provider = new FsHistoryProvider(createTestConf()) val logFile1 = new File(testDir, "app1" + EventLoggingListener.IN_PROGRESS) writeFile(logFile1, true, None, @@ -229,4 +218,8 @@ class FsHistoryProviderSuite extends FunSuite with BeforeAndAfter with Matchers new FileOutputStream(file).close() } + private def createTestConf(): SparkConf = { + new SparkConf().set("spark.history.fs.logDirectory", testDir.getAbsolutePath()) + } + } From 57961567ef104efb3174e67d762c5d9d6263b800 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 6 Feb 2015 14:27:06 -0800 Subject: [PATCH 038/357] [HOTFIX] Fix the maven build after adding sqlContext to spark-shell Follow up to #4387 to fix the build break. Author: Michael Armbrust Closes #4443 from marmbrus/fixMaven and squashes the following commits: 1eeba7d [Michael Armbrust] try again 7f5fb15 [Michael Armbrust] [HOTFIX] Fix the maven build after adding sqlContext to spark-shell --- repl/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/repl/pom.xml b/repl/pom.xml index bd39b90fd8714..3d4adf8fd5b03 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -66,7 +66,6 @@ org.apache.spark spark-sql_${scala.binary.version} ${project.version} - test org.scala-lang From dcd1e42d6b6ac08d2c0736bf61a15f515a1f222b Mon Sep 17 00:00:00 2001 From: Kostas Sakellis Date: Fri, 6 Feb 2015 14:31:20 -0800 Subject: [PATCH 039/357] [SPARK-4874] [CORE] Collect record count metrics Collects record counts for both Input/Output and Shuffle Metrics. For the input/output metrics, it just appends the counter every time the iterators get accessed. For shuffle on the write side, we count the metrics post aggregation (after a map side combine) and on the read side we count the metrics pre aggregation. This allows both the bytes read/written metrics and the records read/written to line up. 
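Once wired through TaskMetrics, the new record counters can be observed alongside the existing byte counts, for example from a SparkListener. The listener below is a hypothetical illustration, not part of this patch; the metric field names mirror the ones added in the diffs that follow:

```
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

// Logs the per-task record counters exposed by this change.
class RecordCountListener extends SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    Option(taskEnd.taskMetrics).foreach { m =>
      val read = m.inputMetrics.map(_.recordsRead).getOrElse(0L)
      val written = m.outputMetrics.map(_.recordsWritten).getOrElse(0L)
      val shuffleRead = m.shuffleReadMetrics.map(_.recordsRead).getOrElse(0L)
      val shuffleWritten = m.shuffleWriteMetrics.map(_.shuffleRecordsWritten).getOrElse(0L)
      println(s"task ${taskEnd.taskInfo.taskId}: input=$read output=$written " +
        s"shuffleRead=$shuffleRead shuffleWrite=$shuffleWritten")
    }
  }
}

// Usage: sc.addSparkListener(new RecordCountListener)
```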
For backwards compatibility, if we deserialize an older event that doesn't have record metrics, we set the metric to -1. Author: Kostas Sakellis Closes #4067 from ksakellis/kostas-spark-4874 and squashes the following commits: bd919be [Kostas Sakellis] Changed 'Records Read' in shuffleReadMetrics json output to 'Total Records Read' dad4d57 [Kostas Sakellis] Add a comment and check to BlockObjectWriter so that it cannot be reopend. 6f236a1 [Kostas Sakellis] Renamed _recordsWritten in ShuffleWriteMetrics to be more consistent 70620a0 [Kostas Sakellis] CR Feedback 17faa3a [Kostas Sakellis] Removed AtomicLong in favour of using Long b6f9923 [Kostas Sakellis] Merge AfterNextInterceptingIterator with InterruptableIterator to save a function call 46c8186 [Kostas Sakellis] Combined Bytes and # records into one column 57551c1 [Kostas Sakellis] Conforms to SPARK-3288 6cdb44e [Kostas Sakellis] Removed the generic InterceptingIterator and repalced it with specific implementation 1aa273c [Kostas Sakellis] CR Feedback 1bb78b1 [Kostas Sakellis] [SPARK-4874] [CORE] Collect record count metrics --- .../scala/org/apache/spark/CacheManager.scala | 11 +- .../apache/spark/executor/TaskMetrics.scala | 54 +++-- .../org/apache/spark/rdd/HadoopRDD.scala | 6 +- .../org/apache/spark/rdd/NewHadoopRDD.scala | 6 +- .../apache/spark/rdd/PairRDDFunctions.scala | 12 +- .../hash/BlockStoreShuffleFetcher.scala | 8 +- .../apache/spark/storage/BlockManager.scala | 2 +- .../spark/storage/BlockObjectWriter.scala | 24 ++- .../scala/org/apache/spark/ui/ToolTips.scala | 9 +- .../apache/spark/ui/exec/ExecutorsTab.scala | 6 + .../apache/spark/ui/jobs/ExecutorTable.scala | 86 ++++++-- .../spark/ui/jobs/JobProgressListener.scala | 24 +++ .../org/apache/spark/ui/jobs/StagePage.scala | 148 +++++++++----- .../org/apache/spark/ui/jobs/UIData.scala | 14 ++ .../org/apache/spark/util/JsonProtocol.scala | 19 +- .../util/collection/ExternalSorter.scala | 1 + .../metrics/InputOutputMetricsSuite.scala | 186 ++++++++++++++++-- .../storage/BlockObjectWriterSuite.scala | 21 ++ .../ui/jobs/JobProgressListenerSuite.scala | 2 +- .../apache/spark/util/JsonProtocolSuite.scala | 55 +++++- 20 files changed, 548 insertions(+), 146 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/CacheManager.scala b/core/src/main/scala/org/apache/spark/CacheManager.scala index a0c0372b7f0ef..a96d754744a05 100644 --- a/core/src/main/scala/org/apache/spark/CacheManager.scala +++ b/core/src/main/scala/org/apache/spark/CacheManager.scala @@ -47,10 +47,15 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging { val inputMetrics = blockResult.inputMetrics val existingMetrics = context.taskMetrics .getInputMetricsForReadMethod(inputMetrics.readMethod) - existingMetrics.addBytesRead(inputMetrics.bytesRead) - - new InterruptibleIterator(context, blockResult.data.asInstanceOf[Iterator[T]]) + existingMetrics.incBytesRead(inputMetrics.bytesRead) + val iter = blockResult.data.asInstanceOf[Iterator[T]] + new InterruptibleIterator[T](context, iter) { + override def next(): T = { + existingMetrics.incRecordsRead(1) + delegate.next() + } + } case None => // Acquire a lock for loading this partition // If another thread already holds the lock, wait for it to finish return its results diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index 97912c68c5982..d05659193b334 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ 
b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -194,18 +194,19 @@ class TaskMetrics extends Serializable { /** * Aggregates shuffle read metrics for all registered dependencies into shuffleReadMetrics. */ - private[spark] def updateShuffleReadMetrics() = synchronized { + private[spark] def updateShuffleReadMetrics(): Unit = synchronized { val merged = new ShuffleReadMetrics() for (depMetrics <- depsShuffleReadMetrics) { merged.incFetchWaitTime(depMetrics.fetchWaitTime) merged.incLocalBlocksFetched(depMetrics.localBlocksFetched) merged.incRemoteBlocksFetched(depMetrics.remoteBlocksFetched) merged.incRemoteBytesRead(depMetrics.remoteBytesRead) + merged.incRecordsRead(depMetrics.recordsRead) } _shuffleReadMetrics = Some(merged) } - private[spark] def updateInputMetrics() = synchronized { + private[spark] def updateInputMetrics(): Unit = synchronized { inputMetrics.foreach(_.updateBytesRead()) } } @@ -242,27 +243,31 @@ object DataWriteMethod extends Enumeration with Serializable { @DeveloperApi case class InputMetrics(readMethod: DataReadMethod.Value) { - private val _bytesRead: AtomicLong = new AtomicLong() + /** + * This is volatile so that it is visible to the updater thread. + */ + @volatile @transient var bytesReadCallback: Option[() => Long] = None /** * Total bytes read. */ - def bytesRead: Long = _bytesRead.get() - @volatile @transient var bytesReadCallback: Option[() => Long] = None + private var _bytesRead: Long = _ + def bytesRead: Long = _bytesRead + def incBytesRead(bytes: Long) = _bytesRead += bytes /** - * Adds additional bytes read for this read method. + * Total records read. */ - def addBytesRead(bytes: Long) = { - _bytesRead.addAndGet(bytes) - } + private var _recordsRead: Long = _ + def recordsRead: Long = _recordsRead + def incRecordsRead(records: Long) = _recordsRead += records /** * Invoke the bytesReadCallback and mutate bytesRead. */ def updateBytesRead() { bytesReadCallback.foreach { c => - _bytesRead.set(c()) + _bytesRead = c() } } @@ -287,6 +292,13 @@ case class OutputMetrics(writeMethod: DataWriteMethod.Value) { private var _bytesWritten: Long = _ def bytesWritten = _bytesWritten private[spark] def setBytesWritten(value : Long) = _bytesWritten = value + + /** + * Total records written + */ + private var _recordsWritten: Long = 0L + def recordsWritten = _recordsWritten + private[spark] def setRecordsWritten(value: Long) = _recordsWritten = value } /** @@ -301,7 +313,7 @@ class ShuffleReadMetrics extends Serializable { private var _remoteBlocksFetched: Int = _ def remoteBlocksFetched = _remoteBlocksFetched private[spark] def incRemoteBlocksFetched(value: Int) = _remoteBlocksFetched += value - private[spark] def defRemoteBlocksFetched(value: Int) = _remoteBlocksFetched -= value + private[spark] def decRemoteBlocksFetched(value: Int) = _remoteBlocksFetched -= value /** * Number of local blocks fetched in this shuffle by this task @@ -309,8 +321,7 @@ class ShuffleReadMetrics extends Serializable { private var _localBlocksFetched: Int = _ def localBlocksFetched = _localBlocksFetched private[spark] def incLocalBlocksFetched(value: Int) = _localBlocksFetched += value - private[spark] def defLocalBlocksFetched(value: Int) = _localBlocksFetched -= value - + private[spark] def decLocalBlocksFetched(value: Int) = _localBlocksFetched -= value /** * Time the task spent waiting for remote shuffle blocks. 
This only includes the time @@ -334,6 +345,14 @@ class ShuffleReadMetrics extends Serializable { * Number of blocks fetched in this shuffle by this task (remote or local) */ def totalBlocksFetched = _remoteBlocksFetched + _localBlocksFetched + + /** + * Total number of records read from the shuffle by this task + */ + private var _recordsRead: Long = _ + def recordsRead = _recordsRead + private[spark] def incRecordsRead(value: Long) = _recordsRead += value + private[spark] def decRecordsRead(value: Long) = _recordsRead -= value } /** @@ -358,5 +377,12 @@ class ShuffleWriteMetrics extends Serializable { private[spark] def incShuffleWriteTime(value: Long) = _shuffleWriteTime += value private[spark] def decShuffleWriteTime(value: Long) = _shuffleWriteTime -= value - + /** + * Total number of records written to the shuffle by this task + */ + @volatile private var _shuffleRecordsWritten: Long = _ + def shuffleRecordsWritten = _shuffleRecordsWritten + private[spark] def incShuffleRecordsWritten(value: Long) = _shuffleRecordsWritten += value + private[spark] def decShuffleRecordsWritten(value: Long) = _shuffleRecordsWritten -= value + private[spark] def setShuffleRecordsWritten(value: Long) = _shuffleRecordsWritten = value } diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 89adddcf0ac36..486e86ce1bb19 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -247,7 +247,9 @@ class HadoopRDD[K, V]( case eof: EOFException => finished = true } - + if (!finished) { + inputMetrics.incRecordsRead(1) + } (key, value) } @@ -261,7 +263,7 @@ class HadoopRDD[K, V]( // If we can't get the bytes read from the FS stats, fall back to the split size, // which may be inaccurate. try { - inputMetrics.addBytesRead(split.inputSplit.value.getLength) + inputMetrics.incBytesRead(split.inputSplit.value.getLength) } catch { case e: java.io.IOException => logWarning("Unable to get input size to set InputMetrics for task", e) diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index 44b9ffd2a53fd..7fb94840df99c 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -151,7 +151,9 @@ class NewHadoopRDD[K, V]( throw new java.util.NoSuchElementException("End of stream") } havePair = false - + if (!finished) { + inputMetrics.incRecordsRead(1) + } (reader.getCurrentKey, reader.getCurrentValue) } @@ -165,7 +167,7 @@ class NewHadoopRDD[K, V]( // If we can't get the bytes read from the FS stats, fall back to the split size, // which may be inaccurate. 
try { - inputMetrics.addBytesRead(split.serializableHadoopSplit.value.getLength) + inputMetrics.incBytesRead(split.serializableHadoopSplit.value.getLength) } catch { case e: java.io.IOException => logWarning("Unable to get input size to set InputMetrics for task", e) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 49b88a90ab5af..955b42c3baaa1 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -34,7 +34,7 @@ import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress.CompressionCodec import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf, OutputFormat} import org.apache.hadoop.mapreduce.{Job => NewAPIHadoopJob, OutputFormat => NewOutputFormat, -RecordWriter => NewRecordWriter} + RecordWriter => NewRecordWriter} import org.apache.spark._ import org.apache.spark.Partitioner.defaultPartitioner @@ -993,8 +993,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context) val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K,V]] + var recordsWritten = 0L try { - var recordsWritten = 0L while (iter.hasNext) { val pair = iter.next() writer.write(pair._1, pair._2) @@ -1008,6 +1008,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } committer.commitTask(hadoopContext) bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) } + outputMetrics.setRecordsWritten(recordsWritten) 1 } : Int @@ -1065,8 +1066,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) writer.setup(context.stageId, context.partitionId, taskAttemptId) writer.open() + var recordsWritten = 0L try { - var recordsWritten = 0L while (iter.hasNext) { val record = iter.next() writer.write(record._1.asInstanceOf[AnyRef], record._2.asInstanceOf[AnyRef]) @@ -1080,6 +1081,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } writer.commit() bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) } + outputMetrics.setRecordsWritten(recordsWritten) } self.context.runJob(self, writeToFile) @@ -1097,9 +1099,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) private def maybeUpdateOutputMetrics(bytesWrittenCallback: Option[() => Long], outputMetrics: OutputMetrics, recordsWritten: Long): Unit = { - if (recordsWritten % PairRDDFunctions.RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES == 0 - && bytesWrittenCallback.isDefined) { + if (recordsWritten % PairRDDFunctions.RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES == 0) { bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) } + outputMetrics.setRecordsWritten(recordsWritten) } } diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala index e3e7434df45b0..7a2c5ae32d98b 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala @@ -86,6 +86,12 @@ private[hash] object BlockStoreShuffleFetcher extends Logging { context.taskMetrics.updateShuffleReadMetrics() }) - new InterruptibleIterator[T](context, completionIter) + new InterruptibleIterator[T](context, completionIter) { + val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency() + override def 
next(): T = { + readMetrics.incRecordsRead(1) + delegate.next() + } + } } } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 8bc5a1cd18b64..86dbd89f0ffb8 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -53,7 +53,7 @@ private[spark] class BlockResult( readMethod: DataReadMethod.Value, bytes: Long) { val inputMetrics = new InputMetrics(readMethod) - inputMetrics.addBytesRead(bytes) + inputMetrics.incBytesRead(bytes) } /** diff --git a/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala index 3198d766fca37..81164178b9e8e 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala @@ -29,7 +29,8 @@ import org.apache.spark.executor.ShuffleWriteMetrics * appending data to an existing block, and can guarantee atomicity in the case of faults * as it allows the caller to revert partial writes. * - * This interface does not support concurrent writes. + * This interface does not support concurrent writes. Also, once the writer has + * been opened, it cannot be reopened again. */ private[spark] abstract class BlockObjectWriter(val blockId: BlockId) { @@ -95,6 +96,7 @@ private[spark] class DiskBlockObjectWriter( private var ts: TimeTrackingOutputStream = null private var objOut: SerializationStream = null private var initialized = false + private var hasBeenClosed = false /** * Cursors used to represent positions in the file. @@ -115,11 +117,16 @@ private[spark] class DiskBlockObjectWriter( private var finalPosition: Long = -1 private var reportedPosition = initialPosition - /** Calling channel.position() to update the write metrics can be a little bit expensive, so we - * only call it every N writes */ - private var writesSinceMetricsUpdate = 0 + /** + * Keep track of number of records written and also use this to periodically + * output bytes written since the latter is expensive to do for each record. + */ + private var numRecordsWritten = 0 override def open(): BlockObjectWriter = { + if (hasBeenClosed) { + throw new IllegalStateException("Writer already closed. 
Cannot be reopened.") + } fos = new FileOutputStream(file, true) ts = new TimeTrackingOutputStream(fos) channel = fos.getChannel() @@ -145,6 +152,7 @@ private[spark] class DiskBlockObjectWriter( ts = null objOut = null initialized = false + hasBeenClosed = true } } @@ -168,6 +176,7 @@ private[spark] class DiskBlockObjectWriter( override def revertPartialWritesAndClose() { try { writeMetrics.decShuffleBytesWritten(reportedPosition - initialPosition) + writeMetrics.decShuffleRecordsWritten(numRecordsWritten) if (initialized) { objOut.flush() @@ -193,12 +202,11 @@ private[spark] class DiskBlockObjectWriter( } objOut.writeObject(value) + numRecordsWritten += 1 + writeMetrics.incShuffleRecordsWritten(1) - if (writesSinceMetricsUpdate == 32) { - writesSinceMetricsUpdate = 0 + if (numRecordsWritten % 32 == 0) { updateBytesWritten() - } else { - writesSinceMetricsUpdate += 1 } } diff --git a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala index 4307029d44fbb..3a15e603b1969 100644 --- a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala +++ b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala @@ -29,14 +29,15 @@ private[spark] object ToolTips { val SHUFFLE_READ_BLOCKED_TIME = "Time that the task spent blocked waiting for shuffle data to be read from remote machines." - val INPUT = "Bytes read from Hadoop or from Spark storage." + val INPUT = "Bytes and records read from Hadoop or from Spark storage." - val OUTPUT = "Bytes written to Hadoop." + val OUTPUT = "Bytes and records written to Hadoop." - val SHUFFLE_WRITE = "Bytes written to disk in order to be read by a shuffle in a future stage." + val SHUFFLE_WRITE = + "Bytes and records written to disk in order to be read by a shuffle in a future stage." val SHUFFLE_READ = - """Bytes read from remote executors. Typically less than shuffle write bytes + """Bytes and records read from remote executors. 
Typically less than shuffle write bytes because this does not include shuffle data read locally.""" val GETTING_RESULT_TIME = diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala index a38cb75fdd8c6..3afd7ef07d7c9 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala @@ -48,7 +48,9 @@ class ExecutorsListener(storageStatusListener: StorageStatusListener) extends Sp val executorToTasksFailed = HashMap[String, Int]() val executorToDuration = HashMap[String, Long]() val executorToInputBytes = HashMap[String, Long]() + val executorToInputRecords = HashMap[String, Long]() val executorToOutputBytes = HashMap[String, Long]() + val executorToOutputRecords = HashMap[String, Long]() val executorToShuffleRead = HashMap[String, Long]() val executorToShuffleWrite = HashMap[String, Long]() val executorToLogUrls = HashMap[String, Map[String, String]]() @@ -84,10 +86,14 @@ class ExecutorsListener(storageStatusListener: StorageStatusListener) extends Sp metrics.inputMetrics.foreach { inputMetrics => executorToInputBytes(eid) = executorToInputBytes.getOrElse(eid, 0L) + inputMetrics.bytesRead + executorToInputRecords(eid) = + executorToInputRecords.getOrElse(eid, 0L) + inputMetrics.recordsRead } metrics.outputMetrics.foreach { outputMetrics => executorToOutputBytes(eid) = executorToOutputBytes.getOrElse(eid, 0L) + outputMetrics.bytesWritten + executorToOutputRecords(eid) = + executorToOutputRecords.getOrElse(eid, 0L) + outputMetrics.recordsWritten } metrics.shuffleReadMetrics.foreach { shuffleRead => executorToShuffleRead(eid) = diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala index 9836d11a6d85f..1f8536d1b7195 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala @@ -36,6 +36,20 @@ private[ui] class ExecutorTable(stageId: Int, stageAttemptId: Int, parent: Stage /** Special table which merges two header cells. 
*/ private def executorTable[T](): Seq[Node] = { + val stageData = listener.stageIdToData.get((stageId, stageAttemptId)) + var hasInput = false + var hasOutput = false + var hasShuffleWrite = false + var hasShuffleRead = false + var hasBytesSpilled = false + stageData.foreach(data => { + hasInput = data.hasInput + hasOutput = data.hasOutput + hasShuffleRead = data.hasShuffleRead + hasShuffleWrite = data.hasShuffleWrite + hasBytesSpilled = data.hasBytesSpilled + }) + @@ -44,12 +58,32 @@ private[ui] class ExecutorTable(stageId: Int, stageAttemptId: Int, parent: Stage - - - - - - + {if (hasInput) { + + }} + {if (hasOutput) { + + }} + {if (hasShuffleRead) { + + }} + {if (hasShuffleWrite) { + + }} + {if (hasBytesSpilled) { + + + }} {createExecutorTable()} @@ -76,18 +110,34 @@ private[ui] class ExecutorTable(stageId: Int, stageAttemptId: Int, parent: Stage - - - - - - + {if (stageData.hasInput) { + + }} + {if (stageData.hasOutput) { + + }} + {if (stageData.hasShuffleRead) { + + }} + {if (stageData.hasShuffleWrite) { + + }} + {if (stageData.hasBytesSpilled) { + + + }} } case None => diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala index 4d200eeda86b9..f463f8d7c7215 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala @@ -394,24 +394,48 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging { stageData.shuffleWriteBytes += shuffleWriteDelta execSummary.shuffleWrite += shuffleWriteDelta + val shuffleWriteRecordsDelta = + (taskMetrics.shuffleWriteMetrics.map(_.shuffleRecordsWritten).getOrElse(0L) + - oldMetrics.flatMap(_.shuffleWriteMetrics).map(_.shuffleRecordsWritten).getOrElse(0L)) + stageData.shuffleWriteRecords += shuffleWriteRecordsDelta + execSummary.shuffleWriteRecords += shuffleWriteRecordsDelta + val shuffleReadDelta = (taskMetrics.shuffleReadMetrics.map(_.remoteBytesRead).getOrElse(0L) - oldMetrics.flatMap(_.shuffleReadMetrics).map(_.remoteBytesRead).getOrElse(0L)) stageData.shuffleReadBytes += shuffleReadDelta execSummary.shuffleRead += shuffleReadDelta + val shuffleReadRecordsDelta = + (taskMetrics.shuffleReadMetrics.map(_.recordsRead).getOrElse(0L) + - oldMetrics.flatMap(_.shuffleReadMetrics).map(_.recordsRead).getOrElse(0L)) + stageData.shuffleReadRecords += shuffleReadRecordsDelta + execSummary.shuffleReadRecords += shuffleReadRecordsDelta + val inputBytesDelta = (taskMetrics.inputMetrics.map(_.bytesRead).getOrElse(0L) - oldMetrics.flatMap(_.inputMetrics).map(_.bytesRead).getOrElse(0L)) stageData.inputBytes += inputBytesDelta execSummary.inputBytes += inputBytesDelta + val inputRecordsDelta = + (taskMetrics.inputMetrics.map(_.recordsRead).getOrElse(0L) + - oldMetrics.flatMap(_.inputMetrics).map(_.recordsRead).getOrElse(0L)) + stageData.inputRecords += inputRecordsDelta + execSummary.inputRecords += inputRecordsDelta + val outputBytesDelta = (taskMetrics.outputMetrics.map(_.bytesWritten).getOrElse(0L) - oldMetrics.flatMap(_.outputMetrics).map(_.bytesWritten).getOrElse(0L)) stageData.outputBytes += outputBytesDelta execSummary.outputBytes += outputBytesDelta + val outputRecordsDelta = + (taskMetrics.outputMetrics.map(_.recordsWritten).getOrElse(0L) + - oldMetrics.flatMap(_.outputMetrics).map(_.recordsWritten).getOrElse(0L)) + stageData.outputRecords += outputRecordsDelta + execSummary.outputRecords += outputRecordsDelta + val diskSpillDelta = 
taskMetrics.diskBytesSpilled - oldMetrics.map(_.diskBytesSpilled).getOrElse(0L) stageData.diskBytesSpilled += diskSpillDelta diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index d8be1b20b3acd..02a3cc3e43c25 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -56,11 +56,6 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { val numCompleted = tasks.count(_.taskInfo.finished) val accumulables = listener.stageIdToData((stageId, stageAttemptId)).accumulables val hasAccumulators = accumulables.size > 0 - val hasInput = stageData.inputBytes > 0 - val hasOutput = stageData.outputBytes > 0 - val hasShuffleRead = stageData.shuffleReadBytes > 0 - val hasShuffleWrite = stageData.shuffleWriteBytes > 0 - val hasBytesSpilled = stageData.memoryBytesSpilled > 0 && stageData.diskBytesSpilled > 0 val summary =
@@ -69,31 +64,33 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { Total task time across all tasks: {UIUtils.formatDuration(stageData.executorRunTime)} - {if (hasInput) { + {if (stageData.hasInput) {
  • - Input: - {Utils.bytesToString(stageData.inputBytes)} + Input Size / Records: + {s"${Utils.bytesToString(stageData.inputBytes)} / ${stageData.inputRecords}"}
  • }} - {if (hasOutput) { + {if (stageData.hasOutput) {
  • Output: - {Utils.bytesToString(stageData.outputBytes)} + {s"${Utils.bytesToString(stageData.outputBytes)} / ${stageData.outputRecords}"}
  • }} - {if (hasShuffleRead) { + {if (stageData.hasShuffleRead) {
  • Shuffle read: - {Utils.bytesToString(stageData.shuffleReadBytes)} + {s"${Utils.bytesToString(stageData.shuffleReadBytes)} / " + + s"${stageData.shuffleReadRecords}"}
  • }} - {if (hasShuffleWrite) { + {if (stageData.hasShuffleWrite) {
  • Shuffle write: - {Utils.bytesToString(stageData.shuffleWriteBytes)} + {s"${Utils.bytesToString(stageData.shuffleWriteBytes)} / " + + s"${stageData.shuffleWriteRecords}"}
  • }} - {if (hasBytesSpilled) { + {if (stageData.hasBytesSpilled) {
  • Shuffle spill (memory): {Utils.bytesToString(stageData.memoryBytesSpilled)} @@ -132,7 +129,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { Task Deserialization Time
  • - {if (hasShuffleRead) { + {if (stageData.hasShuffleRead) {
  • @@ -174,25 +171,32 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { ("Result Serialization Time", TaskDetailsClassNames.RESULT_SERIALIZATION_TIME), ("Getting Result Time", TaskDetailsClassNames.GETTING_RESULT_TIME)) ++ {if (hasAccumulators) Seq(("Accumulators", "")) else Nil} ++ - {if (hasInput) Seq(("Input", "")) else Nil} ++ - {if (hasOutput) Seq(("Output", "")) else Nil} ++ - {if (hasShuffleRead) { + {if (stageData.hasInput) Seq(("Input Size / Records", "")) else Nil} ++ + {if (stageData.hasOutput) Seq(("Output Size / Records", "")) else Nil} ++ + {if (stageData.hasShuffleRead) { Seq(("Shuffle Read Blocked Time", TaskDetailsClassNames.SHUFFLE_READ_BLOCKED_TIME), - ("Shuffle Read", "")) + ("Shuffle Read Size / Records", "")) + } else { + Nil + }} ++ + {if (stageData.hasShuffleWrite) { + Seq(("Write Time", ""), ("Shuffle Write Size / Records", "")) + } else { + Nil + }} ++ + {if (stageData.hasBytesSpilled) { + Seq(("Shuffle Spill (Memory)", ""), ("Shuffle Spill (Disk)", "")) } else { Nil }} ++ - {if (hasShuffleWrite) Seq(("Write Time", ""), ("Shuffle Write", "")) else Nil} ++ - {if (hasBytesSpilled) Seq(("Shuffle Spill (Memory)", ""), ("Shuffle Spill (Disk)", "")) - else Nil} ++ Seq(("Errors", "")) val unzipped = taskHeadersAndCssClasses.unzip val taskTable = UIUtils.listingTable( unzipped._1, - taskRow(hasAccumulators, hasInput, hasOutput, hasShuffleRead, hasShuffleWrite, - hasBytesSpilled), + taskRow(hasAccumulators, stageData.hasInput, stageData.hasOutput, + stageData.hasShuffleRead, stageData.hasShuffleWrite, stageData.hasBytesSpilled), tasks, headerClasses = unzipped._2) // Excludes tasks which failed and have incomplete metrics @@ -203,8 +207,11 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { None } else { + def getDistributionQuantiles(data: Seq[Double]): IndexedSeq[Double] = + Distribution(data).get.getQuantiles() + def getFormattedTimeQuantiles(times: Seq[Double]): Seq[Node] = { - Distribution(times).get.getQuantiles().map { millis => + getDistributionQuantiles(times).map { millis =>
  • } } @@ -273,17 +280,36 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { getFormattedTimeQuantiles(schedulerDelays) def getFormattedSizeQuantiles(data: Seq[Double]) = - Distribution(data).get.getQuantiles().map(d => ) + getDistributionQuantiles(data).map(d => ) + + def getFormattedSizeQuantilesWithRecords(data: Seq[Double], records: Seq[Double]) = { + val recordDist = getDistributionQuantiles(records).iterator + getDistributionQuantiles(data).map(d => + + ) + } val inputSizes = validTasks.map { case TaskUIData(_, metrics, _) => metrics.get.inputMetrics.map(_.bytesRead).getOrElse(0L).toDouble } - val inputQuantiles = +: getFormattedSizeQuantiles(inputSizes) + + val inputRecords = validTasks.map { case TaskUIData(_, metrics, _) => + metrics.get.inputMetrics.map(_.recordsRead).getOrElse(0L).toDouble + } + + val inputQuantiles = +: + getFormattedSizeQuantilesWithRecords(inputSizes, inputRecords) val outputSizes = validTasks.map { case TaskUIData(_, metrics, _) => metrics.get.outputMetrics.map(_.bytesWritten).getOrElse(0L).toDouble } - val outputQuantiles = +: getFormattedSizeQuantiles(outputSizes) + + val outputRecords = validTasks.map { case TaskUIData(_, metrics, _) => + metrics.get.outputMetrics.map(_.recordsWritten).getOrElse(0L).toDouble + } + + val outputQuantiles = +: + getFormattedSizeQuantilesWithRecords(outputSizes, outputRecords) val shuffleReadBlockedTimes = validTasks.map { case TaskUIData(_, metrics, _) => metrics.get.shuffleReadMetrics.map(_.fetchWaitTime).getOrElse(0L).toDouble @@ -294,14 +320,24 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { val shuffleReadSizes = validTasks.map { case TaskUIData(_, metrics, _) => metrics.get.shuffleReadMetrics.map(_.remoteBytesRead).getOrElse(0L).toDouble } - val shuffleReadQuantiles = +: - getFormattedSizeQuantiles(shuffleReadSizes) + + val shuffleReadRecords = validTasks.map { case TaskUIData(_, metrics, _) => + metrics.get.shuffleReadMetrics.map(_.recordsRead).getOrElse(0L).toDouble + } + + val shuffleReadQuantiles = +: + getFormattedSizeQuantilesWithRecords(shuffleReadSizes, shuffleReadRecords) val shuffleWriteSizes = validTasks.map { case TaskUIData(_, metrics, _) => metrics.get.shuffleWriteMetrics.map(_.shuffleBytesWritten).getOrElse(0L).toDouble } - val shuffleWriteQuantiles = +: - getFormattedSizeQuantiles(shuffleWriteSizes) + + val shuffleWriteRecords = validTasks.map { case TaskUIData(_, metrics, _) => + metrics.get.shuffleWriteMetrics.map(_.shuffleRecordsWritten).getOrElse(0L).toDouble + } + + val shuffleWriteQuantiles = +: + getFormattedSizeQuantilesWithRecords(shuffleWriteSizes, shuffleWriteRecords) val memoryBytesSpilledSizes = validTasks.map { case TaskUIData(_, metrics, _) => metrics.get.memoryBytesSpilled.toDouble @@ -326,9 +362,9 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { {serializationQuantiles} , {gettingResultQuantiles}, - if (hasInput) {inputQuantiles} else Nil, - if (hasOutput) {outputQuantiles} else Nil, - if (hasShuffleRead) { + if (stageData.hasInput) {inputQuantiles} else Nil, + if (stageData.hasOutput) {outputQuantiles} else Nil, + if (stageData.hasShuffleRead) { {shuffleReadBlockedQuantiles} @@ -336,9 +372,9 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { } else { Nil }, - if (hasShuffleWrite) {shuffleWriteQuantiles} else Nil, - if (hasBytesSpilled) {memoryBytesSpilledQuantiles} else Nil, - if (hasBytesSpilled) {diskBytesSpilledQuantiles} else Nil) + if (stageData.hasShuffleWrite) 
{shuffleWriteQuantiles} else Nil, + if (stageData.hasBytesSpilled) {memoryBytesSpilledQuantiles} else Nil, + if (stageData.hasBytesSpilled) {diskBytesSpilledQuantiles} else Nil) val quantileHeaders = Seq("Metric", "Min", "25th percentile", "Median", "75th percentile", "Max") @@ -397,26 +433,32 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { val inputReadable = maybeInput .map(m => s"${Utils.bytesToString(m.bytesRead)} (${m.readMethod.toString.toLowerCase()})") .getOrElse("") + val inputRecords = maybeInput.map(_.recordsRead.toString).getOrElse("") val maybeOutput = metrics.flatMap(_.outputMetrics) val outputSortable = maybeOutput.map(_.bytesWritten.toString).getOrElse("") val outputReadable = maybeOutput .map(m => s"${Utils.bytesToString(m.bytesWritten)}") .getOrElse("") + val outputRecords = maybeOutput.map(_.recordsWritten.toString).getOrElse("") - val maybeShuffleReadBlockedTime = metrics.flatMap(_.shuffleReadMetrics).map(_.fetchWaitTime) - val shuffleReadBlockedTimeSortable = maybeShuffleReadBlockedTime.map(_.toString).getOrElse("") + val maybeShuffleRead = metrics.flatMap(_.shuffleReadMetrics) + val shuffleReadBlockedTimeSortable = maybeShuffleRead + .map(_.fetchWaitTime.toString).getOrElse("") val shuffleReadBlockedTimeReadable = - maybeShuffleReadBlockedTime.map(ms => UIUtils.formatDuration(ms)).getOrElse("") + maybeShuffleRead.map(ms => UIUtils.formatDuration(ms.fetchWaitTime)).getOrElse("") - val maybeShuffleRead = metrics.flatMap(_.shuffleReadMetrics).map(_.remoteBytesRead) - val shuffleReadSortable = maybeShuffleRead.map(_.toString).getOrElse("") - val shuffleReadReadable = maybeShuffleRead.map(Utils.bytesToString).getOrElse("") + val shuffleReadSortable = maybeShuffleRead.map(_.remoteBytesRead.toString).getOrElse("") + val shuffleReadReadable = maybeShuffleRead + .map(m => s"${Utils.bytesToString(m.remoteBytesRead)}").getOrElse("") + val shuffleReadRecords = maybeShuffleRead.map(_.recordsRead.toString).getOrElse("") - val maybeShuffleWrite = - metrics.flatMap(_.shuffleWriteMetrics).map(_.shuffleBytesWritten) - val shuffleWriteSortable = maybeShuffleWrite.map(_.toString).getOrElse("") - val shuffleWriteReadable = maybeShuffleWrite.map(Utils.bytesToString).getOrElse("") + val maybeShuffleWrite = metrics.flatMap(_.shuffleWriteMetrics) + val shuffleWriteSortable = maybeShuffleWrite.map(_.shuffleBytesWritten.toString).getOrElse("") + val shuffleWriteReadable = maybeShuffleWrite + .map(m => s"${Utils.bytesToString(m.shuffleBytesWritten)}").getOrElse("") + val shuffleWriteRecords = maybeShuffleWrite + .map(_.shuffleRecordsWritten.toString).getOrElse("") val maybeWriteTime = metrics.flatMap(_.shuffleWriteMetrics).map(_.shuffleWriteTime) val writeTimeSortable = maybeWriteTime.map(_.toString).getOrElse("") @@ -472,12 +514,12 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { }} {if (hasInput) { }} {if (hasOutput) { }} {if (hasShuffleRead) { @@ -486,7 +528,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { {shuffleReadBlockedTimeReadable} }} {if (hasShuffleWrite) { @@ -494,7 +536,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { {writeTimeReadable} }} {if (hasBytesSpilled) { diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala index 01f7e23212c3d..69aac6c862de5 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala 
@@ -31,9 +31,13 @@ private[jobs] object UIData { var failedTasks : Int = 0 var succeededTasks : Int = 0 var inputBytes : Long = 0 + var inputRecords : Long = 0 var outputBytes : Long = 0 + var outputRecords : Long = 0 var shuffleRead : Long = 0 + var shuffleReadRecords : Long = 0 var shuffleWrite : Long = 0 + var shuffleWriteRecords : Long = 0 var memoryBytesSpilled : Long = 0 var diskBytesSpilled : Long = 0 } @@ -73,9 +77,13 @@ private[jobs] object UIData { var executorRunTime: Long = _ var inputBytes: Long = _ + var inputRecords: Long = _ var outputBytes: Long = _ + var outputRecords: Long = _ var shuffleReadBytes: Long = _ + var shuffleReadRecords : Long = _ var shuffleWriteBytes: Long = _ + var shuffleWriteRecords: Long = _ var memoryBytesSpilled: Long = _ var diskBytesSpilled: Long = _ @@ -85,6 +93,12 @@ private[jobs] object UIData { var accumulables = new HashMap[Long, AccumulableInfo] var taskData = new HashMap[Long, TaskUIData] var executorSummary = new HashMap[String, ExecutorSummary] + + def hasInput = inputBytes > 0 + def hasOutput = outputBytes > 0 + def hasShuffleRead = shuffleReadBytes > 0 + def hasShuffleWrite = shuffleWriteBytes > 0 + def hasBytesSpilled = memoryBytesSpilled > 0 && diskBytesSpilled > 0 } /** diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index c8407bbcb780b..b0b545640f5aa 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -293,22 +293,26 @@ private[spark] object JsonProtocol { ("Remote Blocks Fetched" -> shuffleReadMetrics.remoteBlocksFetched) ~ ("Local Blocks Fetched" -> shuffleReadMetrics.localBlocksFetched) ~ ("Fetch Wait Time" -> shuffleReadMetrics.fetchWaitTime) ~ - ("Remote Bytes Read" -> shuffleReadMetrics.remoteBytesRead) + ("Remote Bytes Read" -> shuffleReadMetrics.remoteBytesRead) ~ + ("Total Records Read" -> shuffleReadMetrics.recordsRead) } def shuffleWriteMetricsToJson(shuffleWriteMetrics: ShuffleWriteMetrics): JValue = { ("Shuffle Bytes Written" -> shuffleWriteMetrics.shuffleBytesWritten) ~ - ("Shuffle Write Time" -> shuffleWriteMetrics.shuffleWriteTime) + ("Shuffle Write Time" -> shuffleWriteMetrics.shuffleWriteTime) ~ + ("Shuffle Records Written" -> shuffleWriteMetrics.shuffleRecordsWritten) } def inputMetricsToJson(inputMetrics: InputMetrics): JValue = { ("Data Read Method" -> inputMetrics.readMethod.toString) ~ - ("Bytes Read" -> inputMetrics.bytesRead) + ("Bytes Read" -> inputMetrics.bytesRead) ~ + ("Records Read" -> inputMetrics.recordsRead) } def outputMetricsToJson(outputMetrics: OutputMetrics): JValue = { ("Data Write Method" -> outputMetrics.writeMethod.toString) ~ - ("Bytes Written" -> outputMetrics.bytesWritten) + ("Bytes Written" -> outputMetrics.bytesWritten) ~ + ("Records Written" -> outputMetrics.recordsWritten) } def taskEndReasonToJson(taskEndReason: TaskEndReason): JValue = { @@ -670,6 +674,7 @@ private[spark] object JsonProtocol { metrics.incLocalBlocksFetched((json \ "Local Blocks Fetched").extract[Int]) metrics.incFetchWaitTime((json \ "Fetch Wait Time").extract[Long]) metrics.incRemoteBytesRead((json \ "Remote Bytes Read").extract[Long]) + metrics.incRecordsRead((json \ "Total Records Read").extractOpt[Long].getOrElse(0)) metrics } @@ -677,13 +682,16 @@ private[spark] object JsonProtocol { val metrics = new ShuffleWriteMetrics metrics.incShuffleBytesWritten((json \ "Shuffle Bytes Written").extract[Long]) metrics.incShuffleWriteTime((json \ 
"Shuffle Write Time").extract[Long]) + metrics.setShuffleRecordsWritten((json \ "Shuffle Records Written") + .extractOpt[Long].getOrElse(0)) metrics } def inputMetricsFromJson(json: JValue): InputMetrics = { val metrics = new InputMetrics( DataReadMethod.withName((json \ "Data Read Method").extract[String])) - metrics.addBytesRead((json \ "Bytes Read").extract[Long]) + metrics.incBytesRead((json \ "Bytes Read").extract[Long]) + metrics.incRecordsRead((json \ "Records Read").extractOpt[Long].getOrElse(0)) metrics } @@ -691,6 +699,7 @@ private[spark] object JsonProtocol { val metrics = new OutputMetrics( DataWriteMethod.withName((json \ "Data Write Method").extract[String])) metrics.setBytesWritten((json \ "Bytes Written").extract[Long]) + metrics.setRecordsWritten((json \ "Records Written").extractOpt[Long].getOrElse(0)) metrics } diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index 6ba03841f746b..eaec5a71e6819 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -763,6 +763,7 @@ private[spark] class ExternalSorter[K, V, C]( if (curWriteMetrics != null) { m.incShuffleBytesWritten(curWriteMetrics.shuffleBytesWritten) m.incShuffleWriteTime(curWriteMetrics.shuffleWriteTime) + m.incShuffleRecordsWritten(curWriteMetrics.shuffleRecordsWritten) } } diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala index 81db66ae17464..78fa98a3b9065 100644 --- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala @@ -21,44 +21,46 @@ import java.io.{File, FileWriter, PrintWriter} import scala.collection.mutable.ArrayBuffer -import org.scalatest.FunSuite - +import org.apache.commons.lang.math.RandomUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{LongWritable, Text} -import org.apache.hadoop.mapred.{FileSplit => OldFileSplit, InputSplit => OldInputSplit, JobConf, - LineRecordReader => OldLineRecordReader, RecordReader => OldRecordReader, Reporter, - TextInputFormat => OldTextInputFormat} import org.apache.hadoop.mapred.lib.{CombineFileInputFormat => OldCombineFileInputFormat, - CombineFileSplit => OldCombineFileSplit, CombineFileRecordReader => OldCombineFileRecordReader} -import org.apache.hadoop.mapreduce.{InputSplit => NewInputSplit, RecordReader => NewRecordReader, - TaskAttemptContext} + CombineFileRecordReader => OldCombineFileRecordReader, CombineFileSplit => OldCombineFileSplit} +import org.apache.hadoop.mapred.{JobConf, Reporter, FileSplit => OldFileSplit, + InputSplit => OldInputSplit, LineRecordReader => OldLineRecordReader, + RecordReader => OldRecordReader, TextInputFormat => OldTextInputFormat} import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat => NewCombineFileInputFormat, CombineFileRecordReader => NewCombineFileRecordReader, CombineFileSplit => NewCombineFileSplit, FileSplit => NewFileSplit, TextInputFormat => NewTextInputFormat} +import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat} +import org.apache.hadoop.mapreduce.{TaskAttemptContext, InputSplit => NewInputSplit, + RecordReader => NewRecordReader} +import 
org.scalatest.{BeforeAndAfter, FunSuite} import org.apache.spark.SharedSparkContext import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} import org.apache.spark.util.Utils -class InputOutputMetricsSuite extends FunSuite with SharedSparkContext { +class InputOutputMetricsSuite extends FunSuite with SharedSparkContext + with BeforeAndAfter { @transient var tmpDir: File = _ @transient var tmpFile: File = _ @transient var tmpFilePath: String = _ + @transient val numRecords: Int = 100000 + @transient val numBuckets: Int = 10 - override def beforeAll() { - super.beforeAll() - + before { tmpDir = Utils.createTempDir() val testTempDir = new File(tmpDir, "test") testTempDir.mkdir() tmpFile = new File(testTempDir, getClass.getSimpleName + ".txt") val pw = new PrintWriter(new FileWriter(tmpFile)) - for (x <- 1 to 1000000) { - pw.println("s") + for (x <- 1 to numRecords) { + pw.println(RandomUtils.nextInt(numBuckets)) } pw.close() @@ -66,8 +68,7 @@ class InputOutputMetricsSuite extends FunSuite with SharedSparkContext { tmpFilePath = "file://" + tmpFile.getAbsolutePath } - override def afterAll() { - super.afterAll() + after { Utils.deleteRecursively(tmpDir) } @@ -155,6 +156,101 @@ class InputOutputMetricsSuite extends FunSuite with SharedSparkContext { assert(bytesRead >= tmpFile.length()) } + test("input metrics on records read - simple") { + val records = runAndReturnRecordsRead { + sc.textFile(tmpFilePath, 4).count() + } + assert(records == numRecords) + } + + test("input metrics on records read - more stages") { + val records = runAndReturnRecordsRead { + sc.textFile(tmpFilePath, 4) + .map(key => (key.length, 1)) + .reduceByKey(_ + _) + .count() + } + assert(records == numRecords) + } + + test("input metrics on records - New Hadoop API") { + val records = runAndReturnRecordsRead { + sc.newAPIHadoopFile(tmpFilePath, classOf[NewTextInputFormat], classOf[LongWritable], + classOf[Text]).count() + } + assert(records == numRecords) + } + + test("input metrics on recordsd read with cache") { + // prime the cache manager + val rdd = sc.textFile(tmpFilePath, 4).cache() + rdd.collect() + + val records = runAndReturnRecordsRead { + rdd.count() + } + + assert(records == numRecords) + } + + test("shuffle records read metrics") { + val recordsRead = runAndReturnShuffleRecordsRead { + sc.textFile(tmpFilePath, 4) + .map(key => (key, 1)) + .groupByKey() + .collect() + } + assert(recordsRead == numRecords) + } + + test("shuffle records written metrics") { + val recordsWritten = runAndReturnShuffleRecordsWritten { + sc.textFile(tmpFilePath, 4) + .map(key => (key, 1)) + .groupByKey() + .collect() + } + assert(recordsWritten == numRecords) + } + + /** + * Tests the metrics from end to end. + * 1) reading a hadoop file + * 2) shuffle and writing to a hadoop file. + * 3) writing to hadoop file. 
+ */ + test("input read/write and shuffle read/write metrics all line up") { + var inputRead = 0L + var outputWritten = 0L + var shuffleRead = 0L + var shuffleWritten = 0L + sc.addSparkListener(new SparkListener() { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { + val metrics = taskEnd.taskMetrics + metrics.inputMetrics.foreach(inputRead += _.recordsRead) + metrics.outputMetrics.foreach(outputWritten += _.recordsWritten) + metrics.shuffleReadMetrics.foreach(shuffleRead += _.recordsRead) + metrics.shuffleWriteMetrics.foreach(shuffleWritten += _.shuffleRecordsWritten) + } + }) + + val tmpFile = new File(tmpDir, getClass.getSimpleName) + + sc.textFile(tmpFilePath, 4) + .map(key => (key, 1)) + .reduceByKey(_+_) + .saveAsTextFile("file://" + tmpFile.getAbsolutePath) + + sc.listenerBus.waitUntilEmpty(500) + assert(inputRead == numRecords) + + // Only supported on newer Hadoop + if (SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback().isDefined) { + assert(outputWritten == numBuckets) + } + assert(shuffleRead == shuffleWritten) + } + test("input metrics with interleaved reads") { val numPartitions = 2 val cartVector = 0 to 9 @@ -193,18 +289,66 @@ class InputOutputMetricsSuite extends FunSuite with SharedSparkContext { assert(cartesianBytes == firstSize * numPartitions + (cartVector.length * secondSize)) } - private def runAndReturnBytesRead(job : => Unit): Long = { - val taskBytesRead = new ArrayBuffer[Long]() + private def runAndReturnBytesRead(job: => Unit): Long = { + runAndReturnMetrics(job, _.taskMetrics.inputMetrics.map(_.bytesRead)) + } + + private def runAndReturnRecordsRead(job: => Unit): Long = { + runAndReturnMetrics(job, _.taskMetrics.inputMetrics.map(_.recordsRead)) + } + + private def runAndReturnRecordsWritten(job: => Unit): Long = { + runAndReturnMetrics(job, _.taskMetrics.outputMetrics.map(_.recordsWritten)) + } + + private def runAndReturnShuffleRecordsRead(job: => Unit): Long = { + runAndReturnMetrics(job, _.taskMetrics.shuffleReadMetrics.map(_.recordsRead)) + } + + private def runAndReturnShuffleRecordsWritten(job: => Unit): Long = { + runAndReturnMetrics(job, _.taskMetrics.shuffleWriteMetrics.map(_.shuffleRecordsWritten)) + } + + private def runAndReturnMetrics(job: => Unit, + collector: (SparkListenerTaskEnd) => Option[Long]): Long = { + val taskMetrics = new ArrayBuffer[Long]() sc.addSparkListener(new SparkListener() { override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { - taskBytesRead += taskEnd.taskMetrics.inputMetrics.get.bytesRead + collector(taskEnd).foreach(taskMetrics += _) } }) job sc.listenerBus.waitUntilEmpty(500) - taskBytesRead.sum + taskMetrics.sum + } + + test("output metrics on records written") { + // Only supported on newer Hadoop + if (SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback().isDefined) { + val file = new File(tmpDir, getClass.getSimpleName) + val filePath = "file://" + file.getAbsolutePath + + val records = runAndReturnRecordsWritten { + sc.parallelize(1 to numRecords).saveAsTextFile(filePath) + } + assert(records == numRecords) + } + } + + test("output metrics on records written - new Hadoop API") { + // Only supported on newer Hadoop + if (SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback().isDefined) { + val file = new File(tmpDir, getClass.getSimpleName) + val filePath = "file://" + file.getAbsolutePath + + val records = runAndReturnRecordsWritten { + sc.parallelize(1 to numRecords).map(key => (key.toString, key.toString)) + .saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]](filePath) + } + assert(records 
== numRecords) + } } test("output metrics when writing text file") { @@ -318,4 +462,4 @@ class NewCombineTextRecordReaderWrapper( override def getCurrentValue(): Text = delegate.getCurrentValue override def getProgress(): Float = delegate.getProgress override def close(): Unit = delegate.close() -} \ No newline at end of file +} diff --git a/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala index bbc7e1357b90d..c21c92b63ad13 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockObjectWriterSuite.scala @@ -31,6 +31,8 @@ class BlockObjectWriterSuite extends FunSuite { new JavaSerializer(new SparkConf()), 1024, os => os, true, writeMetrics) writer.write(Long.box(20)) + // Record metrics update on every write + assert(writeMetrics.shuffleRecordsWritten === 1) // Metrics don't update on every write assert(writeMetrics.shuffleBytesWritten == 0) // After 32 writes, metrics should update @@ -39,6 +41,7 @@ class BlockObjectWriterSuite extends FunSuite { writer.write(Long.box(i)) } assert(writeMetrics.shuffleBytesWritten > 0) + assert(writeMetrics.shuffleRecordsWritten === 33) writer.commitAndClose() assert(file.length() == writeMetrics.shuffleBytesWritten) } @@ -51,6 +54,8 @@ class BlockObjectWriterSuite extends FunSuite { new JavaSerializer(new SparkConf()), 1024, os => os, true, writeMetrics) writer.write(Long.box(20)) + // Record metrics update on every write + assert(writeMetrics.shuffleRecordsWritten === 1) // Metrics don't update on every write assert(writeMetrics.shuffleBytesWritten == 0) // After 32 writes, metrics should update @@ -59,7 +64,23 @@ class BlockObjectWriterSuite extends FunSuite { writer.write(Long.box(i)) } assert(writeMetrics.shuffleBytesWritten > 0) + assert(writeMetrics.shuffleRecordsWritten === 33) writer.revertPartialWritesAndClose() assert(writeMetrics.shuffleBytesWritten == 0) + assert(writeMetrics.shuffleRecordsWritten == 0) + } + + test("Reopening a closed block writer") { + val file = new File("somefile") + file.deleteOnExit() + val writeMetrics = new ShuffleWriteMetrics() + val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file, + new JavaSerializer(new SparkConf()), 1024, os => os, true, writeMetrics) + + writer.open() + writer.close() + intercept[IllegalStateException] { + writer.open() + } } } diff --git a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala index 68074ae32a672..e8405baa8e3ea 100644 --- a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala @@ -234,7 +234,7 @@ class JobProgressListenerSuite extends FunSuite with LocalSparkContext with Matc taskMetrics.incMemoryBytesSpilled(base + 6) val inputMetrics = new InputMetrics(DataReadMethod.Hadoop) taskMetrics.setInputMetrics(Some(inputMetrics)) - inputMetrics.addBytesRead(base + 7) + inputMetrics.incBytesRead(base + 7) val outputMetrics = new OutputMetrics(DataWriteMethod.Hadoop) taskMetrics.outputMetrics = Some(outputMetrics) outputMetrics.setBytesWritten(base + 8) diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 842f54529baf0..f3017dc42cd5c 100644 --- 
a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -189,6 +189,34 @@ class JsonProtocolSuite extends FunSuite { assert(newMetrics.inputMetrics.isEmpty) } + test("Input/Output records backwards compatibility") { + // records read were added after 1.2 + val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, + hasHadoopInput = true, hasOutput = true, hasRecords = false) + assert(metrics.inputMetrics.nonEmpty) + assert(metrics.outputMetrics.nonEmpty) + val newJson = JsonProtocol.taskMetricsToJson(metrics) + val oldJson = newJson.removeField { case (field, _) => field == "Records Read" } + .removeField { case (field, _) => field == "Records Written" } + val newMetrics = JsonProtocol.taskMetricsFromJson(oldJson) + assert(newMetrics.inputMetrics.get.recordsRead == 0) + assert(newMetrics.outputMetrics.get.recordsWritten == 0) + } + + test("Shuffle Read/Write records backwards compatibility") { + // records read were added after 1.2 + val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, + hasHadoopInput = false, hasOutput = false, hasRecords = false) + assert(metrics.shuffleReadMetrics.nonEmpty) + assert(metrics.shuffleWriteMetrics.nonEmpty) + val newJson = JsonProtocol.taskMetricsToJson(metrics) + val oldJson = newJson.removeField { case (field, _) => field == "Total Records Read" } + .removeField { case (field, _) => field == "Shuffle Records Written" } + val newMetrics = JsonProtocol.taskMetricsFromJson(oldJson) + assert(newMetrics.shuffleReadMetrics.get.recordsRead == 0) + assert(newMetrics.shuffleWriteMetrics.get.shuffleRecordsWritten == 0) + } + test("OutputMetrics backward compatibility") { // OutputMetrics were added after 1.1 val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, hasHadoopInput = false, hasOutput = true) @@ -644,7 +672,8 @@ class JsonProtocolSuite extends FunSuite { e: Int, f: Int, hasHadoopInput: Boolean, - hasOutput: Boolean) = { + hasOutput: Boolean, + hasRecords: Boolean = true) = { val t = new TaskMetrics t.setHostname("localhost") t.setExecutorDeserializeTime(a) @@ -656,7 +685,8 @@ class JsonProtocolSuite extends FunSuite { if (hasHadoopInput) { val inputMetrics = new InputMetrics(DataReadMethod.Hadoop) - inputMetrics.addBytesRead(d + e + f) + inputMetrics.incBytesRead(d + e + f) + inputMetrics.incRecordsRead(if (hasRecords) (d + e + f) / 100 else -1) t.setInputMetrics(Some(inputMetrics)) } else { val sr = new ShuffleReadMetrics @@ -664,16 +694,19 @@ class JsonProtocolSuite extends FunSuite { sr.incLocalBlocksFetched(e) sr.incFetchWaitTime(a + d) sr.incRemoteBlocksFetched(f) + sr.incRecordsRead(if (hasRecords) (b + d) / 100 else -1) t.setShuffleReadMetrics(Some(sr)) } if (hasOutput) { val outputMetrics = new OutputMetrics(DataWriteMethod.Hadoop) outputMetrics.setBytesWritten(a + b + c) + outputMetrics.setRecordsWritten(if (hasRecords) (a + b + c)/100 else -1) t.outputMetrics = Some(outputMetrics) } else { val sw = new ShuffleWriteMetrics sw.incShuffleBytesWritten(a + b + c) sw.incShuffleWriteTime(b + c + d) + sw.setShuffleRecordsWritten(if (hasRecords) (a + b + c) / 100 else -1) t.shuffleWriteMetrics = Some(sw) } // Make at most 6 blocks @@ -907,11 +940,13 @@ class JsonProtocolSuite extends FunSuite { | "Remote Blocks Fetched": 800, | "Local Blocks Fetched": 700, | "Fetch Wait Time": 900, - | "Remote Bytes Read": 1000 + | "Remote Bytes Read": 1000, + | "Total Records Read" : 10 | }, | "Shuffle Write Metrics": { | "Shuffle Bytes Written": 1200, - | "Shuffle Write Time": 1500 + | 
"Shuffle Write Time": 1500, + | "Shuffle Records Written": 12 | }, | "Updated Blocks": [ | { @@ -988,11 +1023,13 @@ class JsonProtocolSuite extends FunSuite { | "Disk Bytes Spilled": 0, | "Shuffle Write Metrics": { | "Shuffle Bytes Written": 1200, - | "Shuffle Write Time": 1500 + | "Shuffle Write Time": 1500, + | "Shuffle Records Written": 12 | }, | "Input Metrics": { | "Data Read Method": "Hadoop", - | "Bytes Read": 2100 + | "Bytes Read": 2100, + | "Records Read": 21 | }, | "Updated Blocks": [ | { @@ -1069,11 +1106,13 @@ class JsonProtocolSuite extends FunSuite { | "Disk Bytes Spilled": 0, | "Input Metrics": { | "Data Read Method": "Hadoop", - | "Bytes Read": 2100 + | "Bytes Read": 2100, + | "Records Read": 21 | }, | "Output Metrics": { | "Data Write Method": "Hadoop", - | "Bytes Written": 1200 + | "Bytes Written": 1200, + | "Records Written": 12 | }, | "Updated Blocks": [ | { From 2bda1c1d376afd8abe6a04be345461752f3fb1b6 Mon Sep 17 00:00:00 2001 From: huangzhaowei Date: Fri, 6 Feb 2015 14:35:29 -0800 Subject: [PATCH 040/357] [SPARK-5444][Network]Add a retry to deal with the conflict port in netty server. If the `spark.blockMnager.port` had conflicted with a specific port, Spark will throw an exception and exit. So add a retry to avoid this situation. Author: huangzhaowei Closes #4240 from SaintBacchus/NettyPortConflict and squashes the following commits: cc926d2 [huangzhaowei] Add a retry to deal with the conflict port in netty server. --- .../spark/network/server/TransportServer.java | 36 +++++++++++++++++-- .../spark/network/util/TransportConf.java | 7 ++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/network/common/src/main/java/org/apache/spark/network/server/TransportServer.java b/network/common/src/main/java/org/apache/spark/network/server/TransportServer.java index 625c3257d764e..ef209991804b4 100644 --- a/network/common/src/main/java/org/apache/spark/network/server/TransportServer.java +++ b/network/common/src/main/java/org/apache/spark/network/server/TransportServer.java @@ -100,8 +100,7 @@ protected void initChannel(SocketChannel ch) throws Exception { } }); - channelFuture = bootstrap.bind(new InetSocketAddress(portToBind)); - channelFuture.syncUninterruptibly(); + bindRightPort(portToBind); port = ((InetSocketAddress) channelFuture.channel().localAddress()).getPort(); logger.debug("Shuffle server started on port :" + port); @@ -123,4 +122,37 @@ public void close() { bootstrap = null; } + /** + * Attempt to bind to the specified port up to a fixed number of retries. + * If all attempts fail after the max number of retries, exit. + */ + private void bindRightPort(int portToBind) { + int maxPortRetries = conf.portMaxRetries(); + + for (int i = 0; i <= maxPortRetries; i++) { + int tryPort = -1; + if (0 == portToBind) { + // Do not increment port if tryPort is 0, which is treated as a special port + tryPort = 0; + } else { + // If the new port wraps around, do not try a privilege port + tryPort = ((portToBind + i - 1024) % (65536 - 1024)) + 1024; + } + try { + channelFuture = bootstrap.bind(new InetSocketAddress(tryPort)); + channelFuture.syncUninterruptibly(); + return; + } catch (Exception e) { + logger.warn("Netty service could not bind on port " + tryPort + + ". Attempting the next port."); + if (i >= maxPortRetries) { + logger.error(e.getMessage() + ": Netty server failed after " + + maxPortRetries + " retries."); + + // If it can't find a right port, it should exit directly. 
+ System.exit(-1); + } + } + } + } } diff --git a/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java b/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java index 6c9178688693f..2eaf3b71d9a49 100644 --- a/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java +++ b/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java @@ -98,4 +98,11 @@ public int memoryMapBytes() { public boolean lazyFileDescriptor() { return conf.getBoolean("spark.shuffle.io.lazyFD", true); } + + /** + * Maximum number of retries when binding to a port before giving up. + */ + public int portMaxRetries() { + return conf.getInt("spark.port.maxRetries", 16); + } } From 61073f832128845a76469fc37376483b784c927b Mon Sep 17 00:00:00 2001 From: lianhuiwang Date: Fri, 6 Feb 2015 14:47:52 -0800 Subject: [PATCH 041/357] [SPARK-4994][network]Cleanup removed executors' ShuffleInfo in yarn shuffle service when the application is completed, yarn's nodemanager can remove application's local-dirs.but all executors' metadata of completed application havenot be removed. now it lets yarn ShuffleService to have much more memory to store Executors' ShuffleInfo. so these metadata need to be removed. Author: lianhuiwang Closes #3828 from lianhuiwang/SPARK-4994 and squashes the following commits: f3ba1d2 [lianhuiwang] Cleanup removed executors' ShuffleInfo --- .../org/apache/spark/network/yarn/YarnShuffleService.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java index a34aabe9e78a6..63b21222e7b77 100644 --- a/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java +++ b/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java @@ -76,6 +76,9 @@ public class YarnShuffleService extends AuxiliaryService { // The actual server that serves shuffle files private TransportServer shuffleServer = null; + // Handles registering executors and opening shuffle blocks + private ExternalShuffleBlockHandler blockHandler; + public YarnShuffleService() { super("spark_shuffle"); logger.info("Initializing YARN shuffle service for Spark"); @@ -99,7 +102,8 @@ protected void serviceInit(Configuration conf) { // If authentication is enabled, set up the shuffle server to use a // special RPC handler that filters out unauthenticated fetch requests boolean authEnabled = conf.getBoolean(SPARK_AUTHENTICATE_KEY, DEFAULT_SPARK_AUTHENTICATE); - RpcHandler rpcHandler = new ExternalShuffleBlockHandler(transportConf); + blockHandler = new ExternalShuffleBlockHandler(transportConf); + RpcHandler rpcHandler = blockHandler; if (authEnabled) { secretManager = new ShuffleSecretManager(); rpcHandler = new SaslRpcHandler(rpcHandler, secretManager); @@ -136,6 +140,7 @@ public void stopApplication(ApplicationTerminationContext context) { if (isAuthenticationEnabled()) { secretManager.unregisterApp(appId); } + blockHandler.applicationRemoved(appId, false /* clean up local dirs */); } catch (Exception e) { logger.error("Exception when stopping application {}", appId, e); } From 76c4bf59f6544b9c02d99fd18436427cbce632e6 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 6 Feb 2015 15:11:02 -0800 Subject: [PATCH 042/357] [SQL] Use TestSQLContext in Java tests Sometimes tests were failing due to the creation of multiple `SparkContext`s in a single JVM. 
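An illustrative sketch (not part of the diff below; the suite name and assertion are hypothetical) of the shared-context pattern this change adopts: test suites borrow the singleton TestSQLContext and its SparkContext instead of constructing and stopping their own, so only one SparkContext ever exists in the test JVM.

    import org.apache.spark.sql.test.TestSQLContext
    import org.scalatest.FunSuite

    class SharedContextSuiteSketch extends FunSuite {
      // Reuse the JVM-wide test context; never create or stop a SparkContext here.
      private val sqlContext = TestSQLContext
      private val sc = sqlContext.sparkContext

      test("runs against the shared SparkContext") {
        assert(sc.parallelize(1 to 3).count() === 3)
      }
    }
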
Author: Michael Armbrust Closes #4441 from marmbrus/javaTests and squashes the following commits: 657b1e0 [Michael Armbrust] [SQL] Use TestSQLContext in Java tests --- .../java/org/apache/spark/sql/api/java/JavaAPISuite.java | 7 +++---- .../apache/spark/sql/api/java/JavaApplySchemaSuite.java | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java index e5588938ea162..a21a15409080c 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java +++ b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java @@ -19,6 +19,7 @@ import java.io.Serializable; +import org.apache.spark.sql.test.TestSQLContext$; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -37,14 +38,12 @@ public class JavaAPISuite implements Serializable { @Before public void setUp() { - sc = new JavaSparkContext("local", "JavaAPISuite"); - sqlContext = new SQLContext(sc); + sqlContext = TestSQLContext$.MODULE$; + sc = new JavaSparkContext(sqlContext.sparkContext()); } @After public void tearDown() { - sc.stop(); - sc = null; } @SuppressWarnings("unchecked") diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java index 8510bac499092..2e6e977fdc752 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java +++ b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java @@ -22,6 +22,7 @@ import java.util.Arrays; import java.util.List; +import org.apache.spark.sql.test.TestSQLContext$; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -42,13 +43,12 @@ public class JavaApplySchemaSuite implements Serializable { @Before public void setUp() { - javaCtx = new JavaSparkContext("local", "JavaApplySchemaSuite"); - javaSqlCtx = new SQLContext(javaCtx); + javaSqlCtx = TestSQLContext$.MODULE$; + javaCtx = new JavaSparkContext(javaSqlCtx.sparkContext()); } @After public void tearDown() { - javaCtx.stop(); javaCtx = null; javaSqlCtx = null; } From c4021401e326cd5a412a70425f5c75405284880e Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Fri, 6 Feb 2015 15:23:42 -0800 Subject: [PATCH 043/357] [SQL] [Minor] HiveParquetSuite was disabled by mistake, re-enable them [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4440) Author: Cheng Lian Closes #4440 from liancheng/parquet-oops and squashes the following commits: f21ede4 [Cheng Lian] HiveParquetSuite was disabled by mistake, re-enable them. 
--- .../spark/sql/parquet/ParquetQuerySuite.scala | 1 - .../apache/spark/sql/parquet/HiveParquetSuite.scala | 13 ++++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index 48c7598343e55..cba06835f9a61 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -35,7 +35,6 @@ class ParquetQuerySuite extends QueryTest with ParquetTest { } } - // TODO Re-enable this after data source insertion API is merged test(s"$prefix: appending") { val data = (0 until 10).map(i => (i, i.toString)) withParquetTable(data, "t") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala index eae69af5864aa..e89b4489f15d1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.parquet -import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.{SQLConf, QueryTest} import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.hive.test.TestHive @@ -64,8 +64,7 @@ class HiveParquetSuite extends QueryTest with ParquetTest { } } - // TODO Re-enable this after data source insertion API is merged - ignore(s"$prefix: INSERT OVERWRITE TABLE Parquet table") { + test(s"$prefix: INSERT OVERWRITE TABLE Parquet table") { withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") { withTempPath { file => sql("SELECT * FROM t LIMIT 1").saveAsParquetFile(file.getCanonicalPath) @@ -81,4 +80,12 @@ class HiveParquetSuite extends QueryTest with ParquetTest { } } } + + withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") { + run("Parquet data source enabled") + } + + withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") { + run("Parquet data source disabled") + } } From 0e23ca9f805b46d9b3472330676e5c8db926b8f5 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 6 Feb 2015 15:42:59 -0800 Subject: [PATCH 044/357] [SPARK-5601][MLLIB] make streaming linear algorithms Java-friendly Overload `trainOn`, `predictOn`, and `predictOnValues`. 
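An illustrative Scala sketch (not part of the diff; the socket ports, input format, and one-feature model are assumptions) of the existing DStream-based API that the new overloads mirror for JavaDStream and JavaPairDStream, as the Java test suites added below exercise:

    import org.apache.spark.SparkConf
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    val conf = new SparkConf().setMaster("local[2]").setAppName("streaming-lr-sketch")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Assumed inputs: training lines in LabeledPoint.parse format, e.g. "(1.0,[0.5])",
    // and one-feature test points keyed by the raw input line.
    val training = ssc.socketTextStream("localhost", 9999).map(LabeledPoint.parse)
    val test = ssc.socketTextStream("localhost", 9998)
      .map(line => (line, Vectors.dense(line.toDouble)))

    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.dense(0.0))   // one weight, matching the one-feature input
    model.trainOn(training)                    // DStream[LabeledPoint]
    model.predictOnValues(test).print()        // DStream[(String, Vector)] -> (String, Double)

    ssc.start()
    ssc.awaitTermination()
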
CC freeman-lab Author: Xiangrui Meng Closes #4432 from mengxr/streaming-java and squashes the following commits: 6a79b85 [Xiangrui Meng] add java test for streaming logistic regression 2d7b357 [Xiangrui Meng] organize imports 1f662b3 [Xiangrui Meng] make streaming linear algorithms Java-friendly --- .../regression/StreamingLinearAlgorithm.scala | 20 ++++- .../JavaStreamingLogisticRegressionSuite.java | 82 +++++++++++++++++++ .../JavaStreamingLinearRegressionSuite.java | 80 ++++++++++++++++++ 3 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java create mode 100644 mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala index c854f124451cf..ce95c063db970 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala @@ -21,7 +21,9 @@ import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.streaming.api.java.{JavaDStream, JavaPairDStream} import org.apache.spark.streaming.dstream.DStream /** @@ -76,7 +78,7 @@ abstract class StreamingLinearAlgorithm[ * * @param data DStream containing labeled data */ - def trainOn(data: DStream[LabeledPoint]) { + def trainOn(data: DStream[LabeledPoint]): Unit = { if (model.isEmpty) { throw new IllegalArgumentException("Model must be initialized before starting training.") } @@ -99,6 +101,9 @@ abstract class StreamingLinearAlgorithm[ } } + /** Java-friendly version of `trainOn`. */ + def trainOn(data: JavaDStream[LabeledPoint]): Unit = trainOn(data.dstream) + /** * Use the model to make predictions on batches of data from a DStream * @@ -112,6 +117,11 @@ abstract class StreamingLinearAlgorithm[ data.map(model.get.predict) } + /** Java-friendly version of `predictOn`. */ + def predictOn(data: JavaDStream[Vector]): JavaDStream[java.lang.Double] = { + JavaDStream.fromDStream(predictOn(data.dstream).asInstanceOf[DStream[java.lang.Double]]) + } + /** * Use the model to make predictions on the values of a DStream and carry over its keys. * @param data DStream containing feature vectors @@ -124,4 +134,12 @@ abstract class StreamingLinearAlgorithm[ } data.mapValues(model.get.predict) } + + + /** Java-friendly version of `predictOnValues`. */ + def predictOnValues[K](data: JavaPairDStream[K, Vector]): JavaPairDStream[K, java.lang.Double] = { + implicit val tag = fakeClassTag[K] + JavaPairDStream.fromPairDStream( + predictOnValues(data.dstream).asInstanceOf[DStream[(K, java.lang.Double)]]) + } } diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java new file mode 100644 index 0000000000000..ac945ba6f23c1 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.classification; + +import java.io.Serializable; +import java.util.List; + +import scala.Tuple2; + +import com.google.common.collect.Lists; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.SparkConf; +import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.streaming.Duration; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.apache.spark.streaming.api.java.JavaPairDStream; +import org.apache.spark.streaming.api.java.JavaStreamingContext; +import static org.apache.spark.streaming.JavaTestUtils.*; + +public class JavaStreamingLogisticRegressionSuite implements Serializable { + + protected transient JavaStreamingContext ssc; + + @Before + public void setUp() { + SparkConf conf = new SparkConf() + .setMaster("local[2]") + .setAppName("test") + .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext(conf, new Duration(1000)); + ssc.checkpoint("checkpoint"); + } + + @After + public void tearDown() { + ssc.stop(); + ssc = null; + } + + @Test + @SuppressWarnings("unchecked") + public void javaAPI() { + List trainingBatch = Lists.newArrayList( + new LabeledPoint(1.0, Vectors.dense(1.0)), + new LabeledPoint(0.0, Vectors.dense(0.0))); + JavaDStream training = + attachTestInputStream(ssc, Lists.newArrayList(trainingBatch, trainingBatch), 2); + List> testBatch = Lists.newArrayList( + new Tuple2(10, Vectors.dense(1.0)), + new Tuple2(11, Vectors.dense(0.0))); + JavaPairDStream test = JavaPairDStream.fromJavaDStream( + attachTestInputStream(ssc, Lists.newArrayList(testBatch, testBatch), 2)); + StreamingLogisticRegressionWithSGD slr = new StreamingLogisticRegressionWithSGD() + .setNumIterations(2) + .setInitialWeights(Vectors.dense(0.0)); + slr.trainOn(training); + JavaPairDStream prediction = slr.predictOnValues(test); + attachTestOutputStream(prediction.count()); + runStreams(ssc, 2, 2); + } +} diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java new file mode 100644 index 0000000000000..a4dd1ac39a3c8 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.regression; + +import java.io.Serializable; +import java.util.List; + +import scala.Tuple2; + +import com.google.common.collect.Lists; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.SparkConf; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.streaming.Duration; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.apache.spark.streaming.api.java.JavaPairDStream; +import org.apache.spark.streaming.api.java.JavaStreamingContext; +import static org.apache.spark.streaming.JavaTestUtils.*; + +public class JavaStreamingLinearRegressionSuite implements Serializable { + + protected transient JavaStreamingContext ssc; + + @Before + public void setUp() { + SparkConf conf = new SparkConf() + .setMaster("local[2]") + .setAppName("test") + .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext(conf, new Duration(1000)); + ssc.checkpoint("checkpoint"); + } + + @After + public void tearDown() { + ssc.stop(); + ssc = null; + } + + @Test + @SuppressWarnings("unchecked") + public void javaAPI() { + List trainingBatch = Lists.newArrayList( + new LabeledPoint(1.0, Vectors.dense(1.0)), + new LabeledPoint(0.0, Vectors.dense(0.0))); + JavaDStream training = + attachTestInputStream(ssc, Lists.newArrayList(trainingBatch, trainingBatch), 2); + List> testBatch = Lists.newArrayList( + new Tuple2(10, Vectors.dense(1.0)), + new Tuple2(11, Vectors.dense(0.0))); + JavaPairDStream test = JavaPairDStream.fromJavaDStream( + attachTestInputStream(ssc, Lists.newArrayList(testBatch, testBatch), 2)); + StreamingLinearRegressionWithSGD slr = new StreamingLinearRegressionWithSGD() + .setNumIterations(2) + .setInitialWeights(Vectors.dense(0.0)); + slr.trainOn(training); + JavaPairDStream prediction = slr.predictOnValues(test); + attachTestOutputStream(prediction.count()); + runStreams(ssc, 2, 2); + } +} From e772b4e4e1b790199dd000bd096a8917cb8def24 Mon Sep 17 00:00:00 2001 From: Grzegorz Dubicki Date: Fri, 6 Feb 2015 15:43:58 -0800 Subject: [PATCH 045/357] SPARK-5403: Ignore UserKnownHostsFile in SSH calls See https://issues.apache.org/jira/browse/SPARK-5403 Author: Grzegorz Dubicki Closes #4196 from grzegorz-dubicki/SPARK-5403 and squashes the following commits: a7d863f [Grzegorz Dubicki] Resolve start command hanging issue --- ec2/spark_ec2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 7371558274d78..3f7242a53d6fd 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -899,6 +899,7 @@ def stringify_command(parts): def ssh_args(opts): parts = ['-o', 'StrictHostKeyChecking=no'] + parts += ['-o', 'UserKnownHostsFile=/dev/null'] if opts.identity_file is not None: parts += ['-i', opts.identity_file] return parts From 1390e56fa86d07b588f0c3a1a2cc25ab5bb0db67 Mon Sep 17 00:00:00 
2001 From: Andrew Or Date: Fri, 6 Feb 2015 15:57:06 -0800 Subject: [PATCH 046/357] [SPARK-5388] Provide a stable application submission gateway for standalone cluster mode The goal is to provide a stable, REST-based application submission gateway that is not inherently based on Akka, which is unstable across versions. This PR targets standalone cluster mode, but is implemented in a general enough manner that can be potentially extended to other modes in the future. Client mode is currently not included in the changes here because there are many more Akka messages exchanged there. As of the changes here, the Master will advertise two ports, 7077 and 6066. We need to keep around the old one (7077) for client mode and older versions of Spark submit. However, all new versions of Spark submit will use the REST gateway (6066). By the way this includes ~700 lines of tests and ~200 lines of license. Author: Andrew Or Closes #4216 from andrewor14/rest and squashes the following commits: 8d7ce07 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest 6f0c597 [Andrew Or] Use nullable fields for integer and boolean values dfe4bd7 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest b9e2a08 [Andrew Or] Minor comments 02b5cea [Andrew Or] Fix tests d2b1ef8 [Andrew Or] Comment changes + minor code refactoring across the board 9c82a36 [Andrew Or] Minor comment and wording updates b4695e7 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest c9a8ad7 [Andrew Or] Do not include appResource and mainClass as properties 6fc7670 [Andrew Or] Report REST server response back to the user 40e6095 [Andrew Or] Pass submit parameters through system properties cbd670b [Andrew Or] Include unknown fields, if any, in server response 9fee16f [Andrew Or] Include server protocol version on mismatch 09f873a [Andrew Or] Fix style 8188e61 [Andrew Or] Upgrade Jackson from 2.3.0 to 2.4.4 37538e0 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest 9165ae8 [Andrew Or] Fall back to Akka if endpoint was not REST 252d53c [Andrew Or] Clean up server error handling behavior further c643f64 [Andrew Or] Fix style bbbd329 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest 792e112 [Andrew Or] Use specific HTTP response codes on error f98660b [Andrew Or] Version the protocol and include it in REST URL 721819f [Andrew Or] Provide more REST-like interface for submit/kill/status 581f7bf [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest 9e0d1af [Andrew Or] Move some classes around to reduce number of files (minor) 42e5de4 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest 1f1c03f [Andrew Or] Use Jackson's DefaultScalaModule to simplify messages 9229433 [Andrew Or] Reduce duplicate naming in REST field ade28fd [Andrew Or] Clean up REST response output in Spark submit b2fef8b [Andrew Or] Abstract the success field to the general response 6c57b4b [Andrew Or] Increase timeout in end-to-end tests bf696ff [Andrew Or] Add checks for enabling REST when using kill/status 7ee6737 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest e2f7f5f [Andrew Or] Provide more safeguard against missing fields 9581df7 [Andrew Or] Clean up uses of exceptions 914fdff [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest e2104e6 [Andrew Or] stable -> rest 3db7379 [Andrew Or] Fix comments and name fields for better error messages 8d43486 [Andrew Or] Replace SubmitRestProtocolAction with class name df90e8b [Andrew 
Or] Use Jackson for JSON de/serialization d7a1f9f [Andrew Or] Fix local cluster tests efa5e18 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest e42c131 [Andrew Or] Add end-to-end tests for standalone REST protocol 837475b [Andrew Or] Show the REST port on the Master UI d8d3717 [Andrew Or] Use a daemon thread pool for REST server 6568ca5 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest 77774ba [Andrew Or] Minor fixes 206cae4 [Andrew Or] Refactor and add tests for the REST protocol 63c05b3 [Andrew Or] Remove MASTER as a field (minor) 9e21b72 [Andrew Or] Action -> SparkSubmitAction (minor) 51c5ca6 [Andrew Or] Distinguish client and server side Spark versions b44e103 [Andrew Or] Implement status requests + fix validation behavior 120ab9d [Andrew Or] Support kill and request driver status through SparkSubmit 544de1d [Andrew Or] Major clean ups in code and comments e958cae [Andrew Or] Supported nested values in messages 484bd21 [Andrew Or] Specify an ordering for fields in SubmitDriverRequestMessage 6ff088d [Andrew Or] Rename classes to generalize REST protocol af9d9cb [Andrew Or] Integrate REST protocol in standalone mode 53e7c0e [Andrew Or] Initial client, server, and all the messages --- core/pom.xml | 8 + .../org/apache/spark/ui/static/webui.css | 6 + .../scala/org/apache/spark/SparkContext.scala | 2 +- .../apache/spark/deploy/ClientArguments.scala | 20 +- .../apache/spark/deploy/DeployMessage.scala | 15 +- .../spark/deploy/LocalSparkCluster.scala | 12 +- .../org/apache/spark/deploy/SparkSubmit.scala | 142 ++++-- .../spark/deploy/SparkSubmitArguments.scala | 81 +++- .../apache/spark/deploy/master/Master.scala | 46 +- .../spark/deploy/master/MasterMessages.scala | 4 +- .../spark/deploy/master/ui/MasterPage.scala | 18 +- .../deploy/rest/StandaloneRestClient.scala | 307 ++++++++++++ .../deploy/rest/StandaloneRestServer.scala | 449 ++++++++++++++++++ .../rest/SubmitRestProtocolException.scala | 36 ++ .../rest/SubmitRestProtocolMessage.scala | 146 ++++++ .../rest/SubmitRestProtocolRequest.scala | 78 +++ .../rest/SubmitRestProtocolResponse.scala | 85 ++++ .../spark/deploy/JsonProtocolSuite.scala | 3 +- .../spark/deploy/SparkSubmitSuite.scala | 57 ++- .../rest/StandaloneRestSubmitSuite.scala | 265 +++++++++++ .../deploy/rest/SubmitRestProtocolSuite.scala | 324 +++++++++++++ .../KryoSerializerDistributedSuite.scala | 6 +- pom.xml | 11 + 23 files changed, 2027 insertions(+), 94 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestClient.scala create mode 100644 core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala create mode 100644 core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolException.scala create mode 100644 core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala create mode 100644 core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala create mode 100644 core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolResponse.scala create mode 100644 core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala create mode 100644 core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala diff --git a/core/pom.xml b/core/pom.xml index 4daaf88147142..66180035e61f1 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -243,6 +243,14 @@ io.dropwizard.metrics metrics-graphite + + com.fasterxml.jackson.core + jackson-databind + + + com.fasterxml.jackson.module + jackson-module-scala_2.10 + 
org.apache.derby derby diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css index f23ba9dba167f..68b33b5f0d7c7 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.css +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css @@ -103,6 +103,12 @@ span.expand-details { float: right; } +span.rest-uri { + font-size: 10pt; + font-style: italic; + color: gray; +} + pre { font-size: 0.8em; } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 5623587c36fa6..71bdbc9b38ddb 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2110,7 +2110,7 @@ object SparkContext extends Logging { val scheduler = new TaskSchedulerImpl(sc) val localCluster = new LocalSparkCluster( - numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt) + numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt, sc.conf) val masterUrls = localCluster.start() val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls) scheduler.initialize(backend) diff --git a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala index e5873ce724b9f..415bd50591692 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala @@ -29,8 +29,7 @@ import org.apache.spark.util.{IntParam, MemoryParam} * Command-line parser for the driver client. */ private[spark] class ClientArguments(args: Array[String]) { - val defaultCores = 1 - val defaultMemory = 512 + import ClientArguments._ var cmd: String = "" // 'launch' or 'kill' var logLevel = Level.WARN @@ -39,9 +38,9 @@ private[spark] class ClientArguments(args: Array[String]) { var master: String = "" var jarUrl: String = "" var mainClass: String = "" - var supervise: Boolean = false - var memory: Int = defaultMemory - var cores: Int = defaultCores + var supervise: Boolean = DEFAULT_SUPERVISE + var memory: Int = DEFAULT_MEMORY + var cores: Int = DEFAULT_CORES private var _driverOptions = ListBuffer[String]() def driverOptions = _driverOptions.toSeq @@ -50,7 +49,7 @@ private[spark] class ClientArguments(args: Array[String]) { parse(args.toList) - def parse(args: List[String]): Unit = args match { + private def parse(args: List[String]): Unit = args match { case ("--cores" | "-c") :: IntParam(value) :: tail => cores = value parse(tail) @@ -106,9 +105,10 @@ private[spark] class ClientArguments(args: Array[String]) { |Usage: DriverClient kill | |Options: - | -c CORES, --cores CORES Number of cores to request (default: $defaultCores) - | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $defaultMemory) + | -c CORES, --cores CORES Number of cores to request (default: $DEFAULT_CORES) + | -m MEMORY, --memory MEMORY Megabytes of memory to request (default: $DEFAULT_MEMORY) | -s, --supervise Whether to restart the driver on failure + | (default: $DEFAULT_SUPERVISE) | -v, --verbose Print more debugging output """.stripMargin System.err.println(usage) @@ -117,6 +117,10 @@ private[spark] class ClientArguments(args: Array[String]) { } object ClientArguments { + private[spark] val DEFAULT_CORES = 1 + private[spark] val DEFAULT_MEMORY = 512 // MB + private[spark] val DEFAULT_SUPERVISE = false + def isValidJarUrl(s: String): Boolean = { try { val uri = new 
URI(s) diff --git a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala index 243d8edb72ed3..7f600d89604a2 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala @@ -148,15 +148,22 @@ private[deploy] object DeployMessages { // Master to MasterWebUI - case class MasterStateResponse(host: String, port: Int, workers: Array[WorkerInfo], - activeApps: Array[ApplicationInfo], completedApps: Array[ApplicationInfo], - activeDrivers: Array[DriverInfo], completedDrivers: Array[DriverInfo], - status: MasterState) { + case class MasterStateResponse( + host: String, + port: Int, + restPort: Option[Int], + workers: Array[WorkerInfo], + activeApps: Array[ApplicationInfo], + completedApps: Array[ApplicationInfo], + activeDrivers: Array[DriverInfo], + completedDrivers: Array[DriverInfo], + status: MasterState) { Utils.checkHost(host, "Required hostname") assert (port > 0) def uri = "spark://" + host + ":" + port + def restUri: Option[String] = restPort.map { p => "spark://" + host + ":" + p } } // WorkerWebUI to Worker diff --git a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala index 9a7a113c95715..0401b15446a7b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala +++ b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala @@ -33,7 +33,11 @@ import org.apache.spark.util.Utils * fault recovery without spinning up a lot of processes. */ private[spark] -class LocalSparkCluster(numWorkers: Int, coresPerWorker: Int, memoryPerWorker: Int) +class LocalSparkCluster( + numWorkers: Int, + coresPerWorker: Int, + memoryPerWorker: Int, + conf: SparkConf) extends Logging { private val localHostname = Utils.localHostName() @@ -43,9 +47,11 @@ class LocalSparkCluster(numWorkers: Int, coresPerWorker: Int, memoryPerWorker: I def start(): Array[String] = { logInfo("Starting a local Spark cluster with " + numWorkers + " workers.") + // Disable REST server on Master in this mode unless otherwise specified + val _conf = conf.clone().setIfMissing("spark.master.rest.enabled", "false") + /* Start the Master */ - val conf = new SparkConf(false) - val (masterSystem, masterPort, _) = Master.startSystemAndActor(localHostname, 0, 0, conf) + val (masterSystem, masterPort, _, _) = Master.startSystemAndActor(localHostname, 0, 0, _conf) masterActorSystems += masterSystem val masterUrl = "spark://" + localHostname + ":" + masterPort val masters = Array(masterUrl) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 9d25e647f1703..6d213926f3d7b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -18,25 +18,35 @@ package org.apache.spark.deploy import java.io.{File, PrintStream} -import java.lang.reflect.{Modifier, InvocationTargetException} +import java.lang.reflect.{InvocationTargetException, Modifier} import java.net.URL + import scala.collection.mutable.{ArrayBuffer, HashMap, Map} import org.apache.hadoop.fs.Path import org.apache.ivy.Ivy import org.apache.ivy.core.LogOptions -import org.apache.ivy.core.module.descriptor.{DefaultExcludeRule, DefaultDependencyDescriptor, DefaultModuleDescriptor} -import org.apache.ivy.core.module.id.{ModuleId, ArtifactId, 
ModuleRevisionId} +import org.apache.ivy.core.module.descriptor._ +import org.apache.ivy.core.module.id.{ArtifactId, ModuleId, ModuleRevisionId} import org.apache.ivy.core.report.ResolveReport -import org.apache.ivy.core.resolve.{IvyNode, ResolveOptions} +import org.apache.ivy.core.resolve.ResolveOptions import org.apache.ivy.core.retrieve.RetrieveOptions import org.apache.ivy.core.settings.IvySettings import org.apache.ivy.plugins.matcher.GlobPatternMatcher import org.apache.ivy.plugins.resolver.{ChainResolver, IBiblioResolver} -import org.apache.spark.executor.ExecutorURLClassLoader + +import org.apache.spark.deploy.rest._ +import org.apache.spark.executor._ import org.apache.spark.util.Utils -import org.apache.spark.executor.ChildExecutorURLClassLoader -import org.apache.spark.executor.MutableURLClassLoader + +/** + * Whether to submit, kill, or request the status of an application. + * The latter two operations are currently supported only for standalone cluster mode. + */ +private[spark] object SparkSubmitAction extends Enumeration { + type SparkSubmitAction = Value + val SUBMIT, KILL, REQUEST_STATUS = Value +} /** * Main gateway of launching a Spark application. @@ -83,21 +93,74 @@ object SparkSubmit { if (appArgs.verbose) { printStream.println(appArgs) } - val (childArgs, classpath, sysProps, mainClass) = createLaunchEnv(appArgs) - launch(childArgs, classpath, sysProps, mainClass, appArgs.verbose) + appArgs.action match { + case SparkSubmitAction.SUBMIT => submit(appArgs) + case SparkSubmitAction.KILL => kill(appArgs) + case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs) + } + } + + /** Kill an existing submission using the REST protocol. Standalone cluster mode only. */ + private def kill(args: SparkSubmitArguments): Unit = { + new StandaloneRestClient() + .killSubmission(args.master, args.submissionToKill) } /** - * @return a tuple containing - * (1) the arguments for the child process, - * (2) a list of classpath entries for the child, - * (3) a list of system properties and env vars, and - * (4) the main class for the child + * Request the status of an existing submission using the REST protocol. + * Standalone cluster mode only. */ - private[spark] def createLaunchEnv(args: SparkSubmitArguments) - : (ArrayBuffer[String], ArrayBuffer[String], Map[String, String], String) = { + private def requestStatus(args: SparkSubmitArguments): Unit = { + new StandaloneRestClient() + .requestSubmissionStatus(args.master, args.submissionToRequestStatusFor) + } - // Values to return + /** + * Submit the application using the provided parameters. + * + * This runs in two steps. First, we prepare the launch environment by setting up + * the appropriate classpath, system properties, and application arguments for + * running the child main class based on the cluster manager and the deploy mode. + * Second, we use this launch environment to invoke the main method of the child + * main class. + */ + private[spark] def submit(args: SparkSubmitArguments): Unit = { + val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args) + // In standalone cluster mode, there are two submission gateways: + // (1) The traditional Akka gateway using o.a.s.deploy.Client as a wrapper + // (2) The new REST-based gateway introduced in Spark 1.3 + // The latter is the default behavior as of Spark 1.3, but Spark submit will fail over + // to use the legacy gateway if the master endpoint turns out to be not a REST server. 
+ if (args.isStandaloneCluster && args.useRest) { + try { + printStream.println("Running Spark using the REST application submission protocol.") + runMain(childArgs, childClasspath, sysProps, childMainClass) + } catch { + // Fail over to use the legacy submission gateway + case e: SubmitRestConnectionException => + printWarning(s"Master endpoint ${args.master} was not a REST server. " + + "Falling back to legacy submission gateway instead.") + args.useRest = false + submit(args) + } + // In all other modes, just run the main class as prepared + } else { + runMain(childArgs, childClasspath, sysProps, childMainClass) + } + } + + /** + * Prepare the environment for submitting an application. + * This returns a 4-tuple: + * (1) the arguments for the child process, + * (2) a list of classpath entries for the child, + * (3) a map of system properties, and + * (4) the main class for the child + * Exposed for testing. + */ + private[spark] def prepareSubmitEnvironment(args: SparkSubmitArguments) + : (Seq[String], Seq[String], Map[String, String], String) = { + // Return values val childArgs = new ArrayBuffer[String]() val childClasspath = new ArrayBuffer[String]() val sysProps = new HashMap[String, String]() @@ -235,10 +298,13 @@ object SparkSubmit { sysProp = "spark.driver.extraLibraryPath"), // Standalone cluster only + // Do not set CL arguments here because there are multiple possibilities for the main class OptionAssigner(args.jars, STANDALONE, CLUSTER, sysProp = "spark.jars"), OptionAssigner(args.ivyRepoPath, STANDALONE, CLUSTER, sysProp = "spark.jars.ivy"), - OptionAssigner(args.driverMemory, STANDALONE, CLUSTER, clOption = "--memory"), - OptionAssigner(args.driverCores, STANDALONE, CLUSTER, clOption = "--cores"), + OptionAssigner(args.driverMemory, STANDALONE, CLUSTER, sysProp = "spark.driver.memory"), + OptionAssigner(args.driverCores, STANDALONE, CLUSTER, sysProp = "spark.driver.cores"), + OptionAssigner(args.supervise.toString, STANDALONE, CLUSTER, + sysProp = "spark.driver.supervise"), // Yarn client only OptionAssigner(args.queue, YARN, CLIENT, sysProp = "spark.yarn.queue"), @@ -279,7 +345,6 @@ object SparkSubmit { if (args.childArgs != null) { childArgs ++= args.childArgs } } - // Map all arguments to command-line options or system properties for our chosen mode for (opt <- options) { if (opt.value != null && @@ -301,14 +366,21 @@ object SparkSubmit { sysProps.put("spark.jars", jars.mkString(",")) } - // In standalone-cluster mode, use Client as a wrapper around the user class - if (clusterManager == STANDALONE && deployMode == CLUSTER) { - childMainClass = "org.apache.spark.deploy.Client" - if (args.supervise) { - childArgs += "--supervise" + // In standalone cluster mode, use the REST client to submit the application (Spark 1.3+). + // All Spark parameters are expected to be passed to the client through system properties. 
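For illustration, a rough sketch of the launch environment this prepares in standalone cluster mode, before the branch below selects the REST client as the child main class (the jar name, class, host, and values are hypothetical):

    // Hypothetical submission:
    //   spark-submit --master spark://master-host:6066 --deploy-mode cluster \
    //     --class org.apache.spark.examples.SparkPi --driver-memory 1g examples.jar 10
    // would be expected to yield roughly:
    val childMainClass = "org.apache.spark.deploy.rest.StandaloneRestClient"
    val childArgs = Seq("examples.jar", "org.apache.spark.examples.SparkPi", "10")
    val sysProps = Map(
      "spark.master" -> "spark://master-host:6066",
      "spark.driver.memory" -> "1g")
    // i.e. the REST client is the child main class and the Spark parameters travel
    // as system properties rather than as command-line flags.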
+ if (args.isStandaloneCluster) { + if (args.useRest) { + childMainClass = "org.apache.spark.deploy.rest.StandaloneRestClient" + childArgs += (args.primaryResource, args.mainClass) + } else { + // In legacy standalone cluster mode, use Client as a wrapper around the user class + childMainClass = "org.apache.spark.deploy.Client" + if (args.supervise) { childArgs += "--supervise" } + Option(args.driverMemory).foreach { m => childArgs += ("--memory", m) } + Option(args.driverCores).foreach { c => childArgs += ("--cores", c) } + childArgs += "launch" + childArgs += (args.master, args.primaryResource, args.mainClass) } - childArgs += "launch" - childArgs += (args.master, args.primaryResource, args.mainClass) if (args.childArgs != null) { childArgs ++= args.childArgs } @@ -345,7 +417,7 @@ object SparkSubmit { // Ignore invalid spark.driver.host in cluster modes. if (deployMode == CLUSTER) { - sysProps -= ("spark.driver.host") + sysProps -= "spark.driver.host" } // Resolve paths in certain spark properties @@ -374,9 +446,15 @@ object SparkSubmit { (childArgs, childClasspath, sysProps, childMainClass) } - private def launch( - childArgs: ArrayBuffer[String], - childClasspath: ArrayBuffer[String], + /** + * Run the main method of the child class using the provided launch environment. + * + * Note that this main class will not be the one provided by the user if we're + * running cluster deploy mode or python applications. + */ + private def runMain( + childArgs: Seq[String], + childClasspath: Seq[String], sysProps: Map[String, String], childMainClass: String, verbose: Boolean = false) { @@ -697,7 +775,7 @@ private[spark] object SparkSubmitUtils { * Provides an indirection layer for passing arguments as system properties or flags to * the user's driver program or to downstream launcher tools. */ -private[spark] case class OptionAssigner( +private case class OptionAssigner( value: String, clusterManager: Int, deployMode: Int, diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 5cadc534f4baa..bd0ae26fd8210 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -22,6 +22,7 @@ import java.util.jar.JarFile import scala.collection.mutable.{ArrayBuffer, HashMap} +import org.apache.spark.deploy.SparkSubmitAction._ import org.apache.spark.util.Utils /** @@ -39,8 +40,6 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St var driverExtraClassPath: String = null var driverExtraLibraryPath: String = null var driverExtraJavaOptions: String = null - var driverCores: String = null - var supervise: Boolean = false var queue: String = null var numExecutors: String = null var files: String = null @@ -56,8 +55,16 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St var verbose: Boolean = false var isPython: Boolean = false var pyFiles: String = null + var action: SparkSubmitAction = null val sparkProperties: HashMap[String, String] = new HashMap[String, String]() + // Standalone cluster mode only + var supervise: Boolean = false + var driverCores: String = null + var submissionToKill: String = null + var submissionToRequestStatusFor: String = null + var useRest: Boolean = true // used internally + /** Default properties present in the currently defined defaults file. 
*/ lazy val defaultSparkProperties: HashMap[String, String] = { val defaultProperties = new HashMap[String, String]() @@ -82,7 +89,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St // Use `sparkProperties` map along with env vars to fill in any missing parameters loadEnvironmentArguments() - checkRequiredArguments() + validateArguments() /** * Merge values from the default properties file with those specified through --conf. @@ -107,6 +114,15 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St .orElse(sparkProperties.get("spark.master")) .orElse(env.get("MASTER")) .orNull + driverExtraClassPath = Option(driverExtraClassPath) + .orElse(sparkProperties.get("spark.driver.extraClassPath")) + .orNull + driverExtraJavaOptions = Option(driverExtraJavaOptions) + .orElse(sparkProperties.get("spark.driver.extraJavaOptions")) + .orNull + driverExtraLibraryPath = Option(driverExtraLibraryPath) + .orElse(sparkProperties.get("spark.driver.extraLibraryPath")) + .orNull driverMemory = Option(driverMemory) .orElse(sparkProperties.get("spark.driver.memory")) .orElse(env.get("SPARK_DRIVER_MEMORY")) @@ -166,10 +182,21 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St if (name == null && primaryResource != null) { name = Utils.stripDirectory(primaryResource) } + + // Action should be SUBMIT unless otherwise specified + action = Option(action).getOrElse(SUBMIT) } /** Ensure that required fields exists. Call this only once all defaults are loaded. */ - private def checkRequiredArguments(): Unit = { + private def validateArguments(): Unit = { + action match { + case SUBMIT => validateSubmitArguments() + case KILL => validateKillArguments() + case REQUEST_STATUS => validateStatusRequestArguments() + } + } + + private def validateSubmitArguments(): Unit = { if (args.length == 0) { printUsageAndExit(-1) } @@ -192,6 +219,29 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St } } + private def validateKillArguments(): Unit = { + if (!master.startsWith("spark://")) { + SparkSubmit.printErrorAndExit("Killing submissions is only supported in standalone mode!") + } + if (submissionToKill == null) { + SparkSubmit.printErrorAndExit("Please specify a submission to kill.") + } + } + + private def validateStatusRequestArguments(): Unit = { + if (!master.startsWith("spark://")) { + SparkSubmit.printErrorAndExit( + "Requesting submission statuses is only supported in standalone mode!") + } + if (submissionToRequestStatusFor == null) { + SparkSubmit.printErrorAndExit("Please specify a submission to request status for.") + } + } + + def isStandaloneCluster: Boolean = { + master.startsWith("spark://") && deployMode == "cluster" + } + override def toString = { s"""Parsed arguments: | master $master @@ -300,6 +350,22 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St propertiesFile = value parse(tail) + case ("--kill") :: value :: tail => + submissionToKill = value + if (action != null) { + SparkSubmit.printErrorAndExit(s"Action cannot be both $action and $KILL.") + } + action = KILL + parse(tail) + + case ("--status") :: value :: tail => + submissionToRequestStatusFor = value + if (action != null) { + SparkSubmit.printErrorAndExit(s"Action cannot be both $action and $REQUEST_STATUS.") + } + action = REQUEST_STATUS + parse(tail) + case ("--supervise") :: tail => supervise = true parse(tail) @@ -372,7 +438,10 @@ private[spark] class SparkSubmitArguments(args: 
Seq[String], env: Map[String, St outStream.println("Unknown/unsupported param " + unknownParam) } outStream.println( - """Usage: spark-submit [options] [app options] + """Usage: spark-submit [options] [app arguments] + |Usage: spark-submit --kill [submission ID] --master [spark://...] + |Usage: spark-submit --status [submission ID] --master [spark://...] + | |Options: | --master MASTER_URL spark://host:port, mesos://host:port, yarn, or local. | --deploy-mode DEPLOY_MODE Whether to launch the driver program locally ("client") or @@ -413,6 +482,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St | Spark standalone with cluster deploy mode only: | --driver-cores NUM Cores for driver (Default: 1). | --supervise If given, restarts the driver on failure. + | --kill SUBMISSION_ID If given, kills the driver specified. + | --status SUBMISSION_ID If given, requests the status of the driver specified. | | Spark standalone and Mesos only: | --total-executor-cores NUM Total cores for all executors. diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 5eeb9fe526248..b8b1a25abff2e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -43,6 +43,7 @@ import org.apache.spark.deploy.history.HistoryServer import org.apache.spark.deploy.master.DriverState.DriverState import org.apache.spark.deploy.master.MasterMessages._ import org.apache.spark.deploy.master.ui.MasterWebUI +import org.apache.spark.deploy.rest.StandaloneRestServer import org.apache.spark.metrics.MetricsSystem import org.apache.spark.scheduler.{EventLoggingListener, ReplayListenerBus} import org.apache.spark.ui.SparkUI @@ -52,12 +53,12 @@ private[spark] class Master( host: String, port: Int, webUiPort: Int, - val securityMgr: SecurityManager) + val securityMgr: SecurityManager, + val conf: SparkConf) extends Actor with ActorLogReceive with Logging with LeaderElectable { import context.dispatcher // to use Akka's scheduler.schedule() - val conf = new SparkConf val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs @@ -121,6 +122,17 @@ private[spark] class Master( throw new SparkException("spark.deploy.defaultCores must be positive") } + // Alternative application submission gateway that is stable across Spark versions + private val restServerEnabled = conf.getBoolean("spark.master.rest.enabled", true) + private val restServer = + if (restServerEnabled) { + val port = conf.getInt("spark.master.rest.port", 6066) + Some(new StandaloneRestServer(host, port, self, masterUrl, conf)) + } else { + None + } + private val restServerBoundPort = restServer.map(_.start()) + override def preStart() { logInfo("Starting Spark master at " + masterUrl) logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}") @@ -174,6 +186,7 @@ private[spark] class Master( recoveryCompletionTask.cancel() } webUi.stop() + restServer.foreach(_.stop()) masterMetricsSystem.stop() applicationMetricsSystem.stop() persistenceEngine.close() @@ -421,7 +434,9 @@ private[spark] class Master( } case RequestMasterState => { - sender ! MasterStateResponse(host, port, workers.toArray, apps.toArray, completedApps.toArray, + sender ! 
MasterStateResponse( + host, port, restServerBoundPort, + workers.toArray, apps.toArray, completedApps.toArray, drivers.toArray, completedDrivers.toArray, state) } @@ -429,8 +444,8 @@ private[spark] class Master( timeOutDeadWorkers() } - case RequestWebUIPort => { - sender ! WebUIPortResponse(webUi.boundPort) + case BoundPortsRequest => { + sender ! BoundPortsResponse(port, webUi.boundPort, restServerBoundPort) } } @@ -851,7 +866,7 @@ private[spark] object Master extends Logging { SignalLogger.register(log) val conf = new SparkConf val args = new MasterArguments(argStrings, conf) - val (actorSystem, _, _) = startSystemAndActor(args.host, args.port, args.webUiPort, conf) + val (actorSystem, _, _, _) = startSystemAndActor(args.host, args.port, args.webUiPort, conf) actorSystem.awaitTermination() } @@ -875,19 +890,26 @@ private[spark] object Master extends Logging { Address(protocol, systemName, host, port) } + /** + * Start the Master and return a four tuple of: + * (1) The Master actor system + * (2) The bound port + * (3) The web UI bound port + * (4) The REST server bound port, if any + */ def startSystemAndActor( host: String, port: Int, webUiPort: Int, - conf: SparkConf): (ActorSystem, Int, Int) = { + conf: SparkConf): (ActorSystem, Int, Int, Option[Int]) = { val securityMgr = new SecurityManager(conf) val (actorSystem, boundPort) = AkkaUtils.createActorSystem(systemName, host, port, conf = conf, securityManager = securityMgr) - val actor = actorSystem.actorOf(Props(classOf[Master], host, boundPort, webUiPort, - securityMgr), actorName) + val actor = actorSystem.actorOf( + Props(classOf[Master], host, boundPort, webUiPort, securityMgr, conf), actorName) val timeout = AkkaUtils.askTimeout(conf) - val respFuture = actor.ask(RequestWebUIPort)(timeout) - val resp = Await.result(respFuture, timeout).asInstanceOf[WebUIPortResponse] - (actorSystem, boundPort, resp.webUIBoundPort) + val portsRequest = actor.ask(BoundPortsRequest)(timeout) + val portsResponse = Await.result(portsRequest, timeout).asInstanceOf[BoundPortsResponse] + (actorSystem, boundPort, portsResponse.webUIPort, portsResponse.restPort) } } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/MasterMessages.scala b/core/src/main/scala/org/apache/spark/deploy/master/MasterMessages.scala index db72d8ae9bdaf..15c6296888f70 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/MasterMessages.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/MasterMessages.scala @@ -36,7 +36,7 @@ private[master] object MasterMessages { case object CompleteRecovery - case object RequestWebUIPort + case object BoundPortsRequest - case class WebUIPortResponse(webUIBoundPort: Int) + case class BoundPortsResponse(actorPort: Int, webUIPort: Int, restPort: Option[Int]) } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index 7ca3b08a28728..b47a081053e77 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -46,19 +46,19 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { val stateFuture = (master ? 
RequestMasterState)(timeout).mapTo[MasterStateResponse] val state = Await.result(stateFuture, timeout) - val workerHeaders = Seq("Id", "Address", "State", "Cores", "Memory") + val workerHeaders = Seq("Worker Id", "Address", "State", "Cores", "Memory") val workers = state.workers.sortBy(_.id) val workerTable = UIUtils.listingTable(workerHeaders, workerRow, workers) - val appHeaders = Seq("ID", "Name", "Cores", "Memory per Node", "Submitted Time", "User", - "State", "Duration") + val appHeaders = Seq("Application ID", "Name", "Cores", "Memory per Node", "Submitted Time", + "User", "State", "Duration") val activeApps = state.activeApps.sortBy(_.startTime).reverse val activeAppsTable = UIUtils.listingTable(appHeaders, appRow, activeApps) val completedApps = state.completedApps.sortBy(_.endTime).reverse val completedAppsTable = UIUtils.listingTable(appHeaders, appRow, completedApps) - val driverHeaders = Seq("ID", "Submitted Time", "Worker", "State", "Cores", "Memory", - "Main Class") + val driverHeaders = Seq("Submission ID", "Submitted Time", "Worker", "State", "Cores", + "Memory", "Main Class") val activeDrivers = state.activeDrivers.sortBy(_.startTime).reverse val activeDriversTable = UIUtils.listingTable(driverHeaders, driverRow, activeDrivers) val completedDrivers = state.completedDrivers.sortBy(_.startTime).reverse @@ -73,6 +73,14 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") {
               <li><strong>URL:</strong> {state.uri}</li>
+              {
+                state.restUri.map { uri =>
+                  <li>
+                    <strong>REST URL:</strong> {uri}
+                    <span class="rest-uri"> (cluster mode)</span>
+                  </li>
+                }.getOrElse { Seq.empty }
+              }
               <li><strong>Workers:</strong> {state.workers.size}</li>
               <li><strong>Cores:</strong> {state.workers.map(_.cores).sum} Total,
                 {state.workers.map(_.coresUsed).sum} Used</li>
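For reference, a rough sketch of where the REST URL above comes from (the host name is hypothetical): the Master starts the REST server unless it is explicitly disabled and reports the bound port back through MasterStateResponse.restUri.

    // Illustrative sketch only; "master-host" is hypothetical.
    val conf = new org.apache.spark.SparkConf()
    val restEnabled = conf.getBoolean("spark.master.rest.enabled", true) // enabled by default
    val restPort = conf.getInt("spark.master.rest.port", 6066)           // default REST port
    // With host "master-host", state.restUri is then Some("spark://master-host:6066"),
    // which this page renders as "REST URL: spark://master-host:6066 (cluster mode)".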
    • diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestClient.scala b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestClient.scala new file mode 100644 index 0000000000000..115aa5278bb62 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestClient.scala @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.rest + +import java.io.{DataOutputStream, FileNotFoundException} +import java.net.{HttpURLConnection, SocketException, URL} + +import scala.io.Source + +import com.fasterxml.jackson.databind.JsonMappingException +import com.google.common.base.Charsets + +import org.apache.spark.{Logging, SparkConf, SPARK_VERSION => sparkVersion} + +/** + * A client that submits applications to the standalone Master using a REST protocol. + * This client is intended to communicate with the [[StandaloneRestServer]] and is + * currently used for cluster mode only. + * + * In protocol version v1, the REST URL takes the form http://[host:port]/v1/submissions/[action], + * where [action] can be one of create, kill, or status. Each type of request is represented in + * an HTTP message sent to the following prefixes: + * (1) submit - POST to /submissions/create + * (2) kill - POST /submissions/kill/[submissionId] + * (3) status - GET /submissions/status/[submissionId] + * + * In the case of (1), parameters are posted in the HTTP body in the form of JSON fields. + * Otherwise, the URL fully specifies the intended action of the client. + * + * Since the protocol is expected to be stable across Spark versions, existing fields cannot be + * added or removed, though new optional fields can be added. In the rare event that forward or + * backward compatibility is broken, Spark must introduce a new protocol version (e.g. v2). + * + * The client and the server must communicate using the same version of the protocol. If there + * is a mismatch, the server will respond with the highest protocol version it supports. A future + * implementation of this client can use that information to retry using the version specified + * by the server. + */ +private[spark] class StandaloneRestClient extends Logging { + import StandaloneRestClient._ + + /** + * Submit an application specified by the parameters in the provided request. + * + * If the submission was successful, poll the status of the submission and report + * it to the user. Otherwise, report the error message provided by the server. 
+ */ + def createSubmission( + master: String, + request: CreateSubmissionRequest): SubmitRestProtocolResponse = { + logInfo(s"Submitting a request to launch an application in $master.") + validateMaster(master) + val url = getSubmitUrl(master) + val response = postJson(url, request.toJson) + response match { + case s: CreateSubmissionResponse => + reportSubmissionStatus(master, s) + handleRestResponse(s) + case unexpected => + handleUnexpectedRestResponse(unexpected) + } + response + } + + /** Request that the server kill the specified submission. */ + def killSubmission(master: String, submissionId: String): SubmitRestProtocolResponse = { + logInfo(s"Submitting a request to kill submission $submissionId in $master.") + validateMaster(master) + val response = post(getKillUrl(master, submissionId)) + response match { + case k: KillSubmissionResponse => handleRestResponse(k) + case unexpected => handleUnexpectedRestResponse(unexpected) + } + response + } + + /** Request the status of a submission from the server. */ + def requestSubmissionStatus( + master: String, + submissionId: String, + quiet: Boolean = false): SubmitRestProtocolResponse = { + logInfo(s"Submitting a request for the status of submission $submissionId in $master.") + validateMaster(master) + val response = get(getStatusUrl(master, submissionId)) + response match { + case s: SubmissionStatusResponse => if (!quiet) { handleRestResponse(s) } + case unexpected => handleUnexpectedRestResponse(unexpected) + } + response + } + + /** Construct a message that captures the specified parameters for submitting an application. */ + def constructSubmitRequest( + appResource: String, + mainClass: String, + appArgs: Array[String], + sparkProperties: Map[String, String], + environmentVariables: Map[String, String]): CreateSubmissionRequest = { + val message = new CreateSubmissionRequest + message.clientSparkVersion = sparkVersion + message.appResource = appResource + message.mainClass = mainClass + message.appArgs = appArgs + message.sparkProperties = sparkProperties + message.environmentVariables = environmentVariables + message.validate() + message + } + + /** Send a GET request to the specified URL. */ + private def get(url: URL): SubmitRestProtocolResponse = { + logDebug(s"Sending GET request to server at $url.") + val conn = url.openConnection().asInstanceOf[HttpURLConnection] + conn.setRequestMethod("GET") + readResponse(conn) + } + + /** Send a POST request to the specified URL. */ + private def post(url: URL): SubmitRestProtocolResponse = { + logDebug(s"Sending POST request to server at $url.") + val conn = url.openConnection().asInstanceOf[HttpURLConnection] + conn.setRequestMethod("POST") + readResponse(conn) + } + + /** Send a POST request with the given JSON as the body to the specified URL. */ + private def postJson(url: URL, json: String): SubmitRestProtocolResponse = { + logDebug(s"Sending POST request to server at $url:\n$json") + val conn = url.openConnection().asInstanceOf[HttpURLConnection] + conn.setRequestMethod("POST") + conn.setRequestProperty("Content-Type", "application/json") + conn.setRequestProperty("charset", "utf-8") + conn.setDoOutput(true) + val out = new DataOutputStream(conn.getOutputStream) + out.write(json.getBytes(Charsets.UTF_8)) + out.close() + readResponse(conn) + } + + /** + * Read the response from the server and return it as a validated [[SubmitRestProtocolResponse]]. + * If the response represents an error, report the embedded message to the user. 
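Putting the client methods above together, a minimal usage sketch (host, port, jar path, and property values are hypothetical):

    import org.apache.spark.deploy.rest.StandaloneRestClient

    // Build a create request and send it to a standalone Master, roughly mirroring
    // what spark-submit does for standalone cluster mode.
    val client = new StandaloneRestClient
    val request = client.constructSubmitRequest(
      appResource = "file:/tmp/spark-examples.jar",
      mainClass = "org.apache.spark.examples.SparkPi",
      appArgs = Array("10"),
      sparkProperties = Map(
        "spark.app.name" -> "SparkPi",                 // required by request validation
        "spark.master" -> "spark://master-host:6066"),
      environmentVariables = Map.empty)
    client.createSubmission("spark://master-host:6066", request)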
+ */ + private def readResponse(connection: HttpURLConnection): SubmitRestProtocolResponse = { + try { + val responseJson = Source.fromInputStream(connection.getInputStream).mkString + logDebug(s"Response from the server:\n$responseJson") + val response = SubmitRestProtocolMessage.fromJson(responseJson) + response.validate() + response match { + // If the response is an error, log the message + case error: ErrorResponse => + logError(s"Server responded with error:\n${error.message}") + error + // Otherwise, simply return the response + case response: SubmitRestProtocolResponse => response + case unexpected => + throw new SubmitRestProtocolException( + s"Message received from server was not a response:\n${unexpected.toJson}") + } + } catch { + case unreachable @ (_: FileNotFoundException | _: SocketException) => + throw new SubmitRestConnectionException( + s"Unable to connect to server ${connection.getURL}", unreachable) + case malformed @ (_: SubmitRestProtocolException | _: JsonMappingException) => + throw new SubmitRestProtocolException( + "Malformed response received from server", malformed) + } + } + + /** Return the REST URL for creating a new submission. */ + private def getSubmitUrl(master: String): URL = { + val baseUrl = getBaseUrl(master) + new URL(s"$baseUrl/create") + } + + /** Return the REST URL for killing an existing submission. */ + private def getKillUrl(master: String, submissionId: String): URL = { + val baseUrl = getBaseUrl(master) + new URL(s"$baseUrl/kill/$submissionId") + } + + /** Return the REST URL for requesting the status of an existing submission. */ + private def getStatusUrl(master: String, submissionId: String): URL = { + val baseUrl = getBaseUrl(master) + new URL(s"$baseUrl/status/$submissionId") + } + + /** Return the base URL for communicating with the server, including the protocol version. */ + private def getBaseUrl(master: String): String = { + val masterUrl = master.stripPrefix("spark://").stripSuffix("/") + s"http://$masterUrl/$PROTOCOL_VERSION/submissions" + } + + /** Throw an exception if this is not standalone mode. */ + private def validateMaster(master: String): Unit = { + if (!master.startsWith("spark://")) { + throw new IllegalArgumentException("This REST client is only supported in standalone mode.") + } + } + + /** Report the status of a newly created submission. */ + private def reportSubmissionStatus( + master: String, + submitResponse: CreateSubmissionResponse): Unit = { + if (submitResponse.success) { + val submissionId = submitResponse.submissionId + if (submissionId != null) { + logInfo(s"Submission successfully created as $submissionId. Polling submission state...") + pollSubmissionStatus(master, submissionId) + } else { + // should never happen + logError("Application successfully submitted, but submission ID was not provided!") + } + } else { + val failMessage = Option(submitResponse.message).map { ": " + _ }.getOrElse("") + logError("Application submission failed" + failMessage) + } + } + + /** + * Poll the status of the specified submission and log it. + * This retries up to a fixed number of times before giving up. 
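As a worked example of the URL helpers above (host, port, and submission ID are hypothetical):

    // For master = "spark://master-host:6066" and PROTOCOL_VERSION = "v1":
    //   getBaseUrl(master)   -> "http://master-host:6066/v1/submissions"
    //   getSubmitUrl(master) -> http://master-host:6066/v1/submissions/create
    //   getKillUrl(master, "driver-20150206-0000")
    //     -> http://master-host:6066/v1/submissions/kill/driver-20150206-0000
    //   getStatusUrl(master, "driver-20150206-0000")
    //     -> http://master-host:6066/v1/submissions/status/driver-20150206-0000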
+ */ + private def pollSubmissionStatus(master: String, submissionId: String): Unit = { + (1 to REPORT_DRIVER_STATUS_MAX_TRIES).foreach { _ => + val response = requestSubmissionStatus(master, submissionId, quiet = true) + val statusResponse = response match { + case s: SubmissionStatusResponse => s + case _ => return // unexpected type, let upstream caller handle it + } + if (statusResponse.success) { + val driverState = Option(statusResponse.driverState) + val workerId = Option(statusResponse.workerId) + val workerHostPort = Option(statusResponse.workerHostPort) + val exception = Option(statusResponse.message) + // Log driver state, if present + driverState match { + case Some(state) => logInfo(s"State of driver $submissionId is now $state.") + case _ => logError(s"State of driver $submissionId was not found!") + } + // Log worker node, if present + (workerId, workerHostPort) match { + case (Some(id), Some(hp)) => logInfo(s"Driver is running on worker $id at $hp.") + case _ => + } + // Log exception stack trace, if present + exception.foreach { e => logError(e) } + return + } + Thread.sleep(REPORT_DRIVER_STATUS_INTERVAL) + } + logError(s"Error: Master did not recognize driver $submissionId.") + } + + /** Log the response sent by the server in the REST application submission protocol. */ + private def handleRestResponse(response: SubmitRestProtocolResponse): Unit = { + logInfo(s"Server responded with ${response.messageType}:\n${response.toJson}") + } + + /** Log an appropriate error if the response sent by the server is not of the expected type. */ + private def handleUnexpectedRestResponse(unexpected: SubmitRestProtocolResponse): Unit = { + logError(s"Error: Server responded with message of unexpected type ${unexpected.messageType}.") + } +} + +private[spark] object StandaloneRestClient { + val REPORT_DRIVER_STATUS_INTERVAL = 1000 + val REPORT_DRIVER_STATUS_MAX_TRIES = 10 + val PROTOCOL_VERSION = "v1" + + /** Submit an application, assuming Spark parameters are specified through system properties. */ + def main(args: Array[String]): Unit = { + if (args.size < 2) { + sys.error("Usage: StandaloneRestClient [app resource] [main class] [app args*]") + sys.exit(1) + } + val appResource = args(0) + val mainClass = args(1) + val appArgs = args.slice(2, args.size) + val conf = new SparkConf + val master = conf.getOption("spark.master").getOrElse { + throw new IllegalArgumentException("'spark.master' must be set.") + } + val sparkProperties = conf.getAll.toMap + val environmentVariables = sys.env.filter { case (k, _) => k.startsWith("SPARK_") } + val client = new StandaloneRestClient + val submitRequest = client.constructSubmitRequest( + appResource, mainClass, appArgs, sparkProperties, environmentVariables) + client.createSubmission(master, submitRequest) + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala new file mode 100644 index 0000000000000..2033d67e1f394 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala @@ -0,0 +1,449 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.rest + +import java.io.{DataOutputStream, File} +import java.net.InetSocketAddress +import javax.servlet.http.{HttpServlet, HttpServletRequest, HttpServletResponse} + +import scala.io.Source + +import akka.actor.ActorRef +import com.fasterxml.jackson.databind.JsonMappingException +import com.google.common.base.Charsets +import org.eclipse.jetty.server.Server +import org.eclipse.jetty.servlet.{ServletHolder, ServletContextHandler} +import org.eclipse.jetty.util.thread.QueuedThreadPool +import org.json4s._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.{Logging, SparkConf, SPARK_VERSION => sparkVersion} +import org.apache.spark.util.{AkkaUtils, Utils} +import org.apache.spark.deploy.{Command, DeployMessages, DriverDescription} +import org.apache.spark.deploy.ClientArguments._ + +/** + * A server that responds to requests submitted by the [[StandaloneRestClient]]. + * This is intended to be embedded in the standalone Master and used in cluster mode only. + * + * This server responds with different HTTP codes depending on the situation: + * 200 OK - Request was processed successfully + * 400 BAD REQUEST - Request was malformed, not successfully validated, or of unexpected type + * 468 UNKNOWN PROTOCOL VERSION - Request specified a protocol this server does not understand + * 500 INTERNAL SERVER ERROR - Server throws an exception internally while processing the request + * + * The server always includes a JSON representation of the relevant [[SubmitRestProtocolResponse]] + * in the HTTP body. If an error occurs, however, the server will include an [[ErrorResponse]] + * instead of the one expected by the client. If the construction of this error response itself + * fails, the response will consist of an empty body with a response code that indicates internal + * server error. + * + * @param host the address this server should bind to + * @param requestedPort the port this server will attempt to bind to + * @param masterActor reference to the Master actor to which requests can be sent + * @param masterUrl the URL of the Master new drivers will attempt to connect to + * @param masterConf the conf used by the Master + */ +private[spark] class StandaloneRestServer( + host: String, + requestedPort: Int, + masterActor: ActorRef, + masterUrl: String, + masterConf: SparkConf) + extends Logging { + + import StandaloneRestServer._ + + private var _server: Option[Server] = None + private val baseContext = s"/$PROTOCOL_VERSION/submissions" + + // A mapping from servlets to the URL prefixes they are responsible for + private val servletToContext = Map[StandaloneRestServlet, String]( + new SubmitRequestServlet(masterActor, masterUrl, masterConf) -> s"$baseContext/create/*", + new KillRequestServlet(masterActor, masterConf) -> s"$baseContext/kill/*", + new StatusRequestServlet(masterActor, masterConf) -> s"$baseContext/status/*", + new ErrorServlet -> "/" // default handler + ) + + /** Start the server and return the bound port. 
*/ + def start(): Int = { + val (server, boundPort) = Utils.startServiceOnPort[Server](requestedPort, doStart, masterConf) + _server = Some(server) + logInfo(s"Started REST server for submitting applications on port $boundPort") + boundPort + } + + /** + * Map the servlets to their corresponding contexts and attach them to a server. + * Return a 2-tuple of the started server and the bound port. + */ + private def doStart(startPort: Int): (Server, Int) = { + val server = new Server(new InetSocketAddress(host, startPort)) + val threadPool = new QueuedThreadPool + threadPool.setDaemon(true) + server.setThreadPool(threadPool) + val mainHandler = new ServletContextHandler + mainHandler.setContextPath("/") + servletToContext.foreach { case (servlet, prefix) => + mainHandler.addServlet(new ServletHolder(servlet), prefix) + } + server.setHandler(mainHandler) + server.start() + val boundPort = server.getConnectors()(0).getLocalPort + (server, boundPort) + } + + def stop(): Unit = { + _server.foreach(_.stop()) + } +} + +private object StandaloneRestServer { + val PROTOCOL_VERSION = StandaloneRestClient.PROTOCOL_VERSION + val SC_UNKNOWN_PROTOCOL_VERSION = 468 +} + +/** + * An abstract servlet for handling requests passed to the [[StandaloneRestServer]]. + */ +private abstract class StandaloneRestServlet extends HttpServlet with Logging { + + /** Service a request. If an exception is thrown in the process, indicate server error. */ + protected override def service( + request: HttpServletRequest, + response: HttpServletResponse): Unit = { + try { + super.service(request, response) + } catch { + case e: Exception => + logError("Exception while handling request", e) + response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR) + } + } + + /** + * Serialize the given response message to JSON and send it through the response servlet. + * This validates the response before sending it to ensure it is properly constructed. + */ + protected def sendResponse( + responseMessage: SubmitRestProtocolResponse, + responseServlet: HttpServletResponse): Unit = { + val message = validateResponse(responseMessage, responseServlet) + responseServlet.setContentType("application/json") + responseServlet.setCharacterEncoding("utf-8") + responseServlet.setStatus(HttpServletResponse.SC_OK) + val content = message.toJson.getBytes(Charsets.UTF_8) + val out = new DataOutputStream(responseServlet.getOutputStream) + out.write(content) + out.close() + } + + /** + * Return any fields in the client request message that the server does not know about. + * + * The mechanism for this is to reconstruct the JSON on the server side and compare the + * diff between this JSON and the one generated on the client side. Any fields that are + * only in the client JSON are treated as unexpected. + */ + protected def findUnknownFields( + requestJson: String, + requestMessage: SubmitRestProtocolMessage): Array[String] = { + val clientSideJson = parse(requestJson) + val serverSideJson = parse(requestMessage.toJson) + val Diff(_, _, unknown) = clientSideJson.diff(serverSideJson) + unknown match { + case j: JObject => j.obj.map { case (k, _) => k }.toArray + case _ => Array.empty[String] // No difference + } + } + + /** Return a human readable String representation of the exception. */ + protected def formatException(e: Throwable): String = { + val stackTraceString = e.getStackTrace.map { "\t" + _ }.mkString("\n") + s"$e\n$stackTraceString" + } + + /** Construct an error message to signal the fact that an exception has been thrown. 
*/ + protected def handleError(message: String): ErrorResponse = { + val e = new ErrorResponse + e.serverSparkVersion = sparkVersion + e.message = message + e + } + + /** + * Validate the response to ensure that it is correctly constructed. + * + * If it is, simply return the message as is. Otherwise, return an error response instead + * to propagate the exception back to the client and set the appropriate error code. + */ + private def validateResponse( + responseMessage: SubmitRestProtocolResponse, + responseServlet: HttpServletResponse): SubmitRestProtocolResponse = { + try { + responseMessage.validate() + responseMessage + } catch { + case e: Exception => + responseServlet.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR) + handleError("Internal server error: " + formatException(e)) + } + } +} + +/** + * A servlet for handling kill requests passed to the [[StandaloneRestServer]]. + */ +private class KillRequestServlet(masterActor: ActorRef, conf: SparkConf) + extends StandaloneRestServlet { + + /** + * If a submission ID is specified in the URL, have the Master kill the corresponding + * driver and return an appropriate response to the client. Otherwise, return error. + */ + protected override def doPost( + request: HttpServletRequest, + response: HttpServletResponse): Unit = { + val submissionId = request.getPathInfo.stripPrefix("/") + val responseMessage = + if (submissionId.nonEmpty) { + handleKill(submissionId) + } else { + response.setStatus(HttpServletResponse.SC_BAD_REQUEST) + handleError("Submission ID is missing in kill request.") + } + sendResponse(responseMessage, response) + } + + private def handleKill(submissionId: String): KillSubmissionResponse = { + val askTimeout = AkkaUtils.askTimeout(conf) + val response = AkkaUtils.askWithReply[DeployMessages.KillDriverResponse]( + DeployMessages.RequestKillDriver(submissionId), masterActor, askTimeout) + val k = new KillSubmissionResponse + k.serverSparkVersion = sparkVersion + k.message = response.message + k.submissionId = submissionId + k.success = response.success + k + } +} + +/** + * A servlet for handling status requests passed to the [[StandaloneRestServer]]. + */ +private class StatusRequestServlet(masterActor: ActorRef, conf: SparkConf) + extends StandaloneRestServlet { + + /** + * If a submission ID is specified in the URL, request the status of the corresponding + * driver from the Master and include it in the response. Otherwise, return error. 
+ */ + protected override def doGet( + request: HttpServletRequest, + response: HttpServletResponse): Unit = { + val submissionId = request.getPathInfo.stripPrefix("/") + val responseMessage = + if (submissionId.nonEmpty) { + handleStatus(submissionId) + } else { + response.setStatus(HttpServletResponse.SC_BAD_REQUEST) + handleError("Submission ID is missing in status request.") + } + sendResponse(responseMessage, response) + } + + private def handleStatus(submissionId: String): SubmissionStatusResponse = { + val askTimeout = AkkaUtils.askTimeout(conf) + val response = AkkaUtils.askWithReply[DeployMessages.DriverStatusResponse]( + DeployMessages.RequestDriverStatus(submissionId), masterActor, askTimeout) + val message = response.exception.map { s"Exception from the cluster:\n" + formatException(_) } + val d = new SubmissionStatusResponse + d.serverSparkVersion = sparkVersion + d.submissionId = submissionId + d.success = response.found + d.driverState = response.state.map(_.toString).orNull + d.workerId = response.workerId.orNull + d.workerHostPort = response.workerHostPort.orNull + d.message = message.orNull + d + } +} + +/** + * A servlet for handling submit requests passed to the [[StandaloneRestServer]]. + */ +private class SubmitRequestServlet( + masterActor: ActorRef, + masterUrl: String, + conf: SparkConf) + extends StandaloneRestServlet { + + /** + * Submit an application to the Master with parameters specified in the request. + * + * The request is assumed to be a [[SubmitRestProtocolRequest]] in the form of JSON. + * If the request is successfully processed, return an appropriate response to the + * client indicating so. Otherwise, return error instead. + */ + protected override def doPost( + requestServlet: HttpServletRequest, + responseServlet: HttpServletResponse): Unit = { + val responseMessage = + try { + val requestMessageJson = Source.fromInputStream(requestServlet.getInputStream).mkString + val requestMessage = SubmitRestProtocolMessage.fromJson(requestMessageJson) + // The response should have already been validated on the client. + // In case this is not true, validate it ourselves to avoid potential NPEs. + requestMessage.validate() + handleSubmit(requestMessageJson, requestMessage, responseServlet) + } catch { + // The client failed to provide a valid JSON, so this is not our fault + case e @ (_: JsonMappingException | _: SubmitRestProtocolException) => + responseServlet.setStatus(HttpServletResponse.SC_BAD_REQUEST) + handleError("Malformed request: " + formatException(e)) + } + sendResponse(responseMessage, responseServlet) + } + + /** + * Handle the submit request and construct an appropriate response to return to the client. + * + * This assumes that the request message is already successfully validated. + * If the request message is not of the expected type, return error to the client. 
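For reference, a successful status response produced by handleStatus above would serialize to JSON along these lines (submission ID, worker identifiers, and version are hypothetical):

    // {
    //   "action" : "SubmissionStatusResponse",
    //   "driverState" : "RUNNING",
    //   "serverSparkVersion" : "1.3.0",
    //   "submissionId" : "driver-20150206-0000",
    //   "success" : true,
    //   "workerHostPort" : "192.168.1.2:7078",
    //   "workerId" : "worker-20150206-192.168.1.2-7078"
    // }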
+ */ + private def handleSubmit( + requestMessageJson: String, + requestMessage: SubmitRestProtocolMessage, + responseServlet: HttpServletResponse): SubmitRestProtocolResponse = { + requestMessage match { + case submitRequest: CreateSubmissionRequest => + val askTimeout = AkkaUtils.askTimeout(conf) + val driverDescription = buildDriverDescription(submitRequest) + val response = AkkaUtils.askWithReply[DeployMessages.SubmitDriverResponse]( + DeployMessages.RequestSubmitDriver(driverDescription), masterActor, askTimeout) + val submitResponse = new CreateSubmissionResponse + submitResponse.serverSparkVersion = sparkVersion + submitResponse.message = response.message + submitResponse.success = response.success + submitResponse.submissionId = response.driverId.orNull + val unknownFields = findUnknownFields(requestMessageJson, requestMessage) + if (unknownFields.nonEmpty) { + // If there are fields that the server does not know about, warn the client + submitResponse.unknownFields = unknownFields + } + submitResponse + case unexpected => + responseServlet.setStatus(HttpServletResponse.SC_BAD_REQUEST) + handleError(s"Received message of unexpected type ${unexpected.messageType}.") + } + } + + /** + * Build a driver description from the fields specified in the submit request. + * + * This involves constructing a command that takes into account memory, java options, + * classpath and other settings to launch the driver. This does not currently consider + * fields used by python applications since python is not supported in standalone + * cluster mode yet. + */ + private def buildDriverDescription(request: CreateSubmissionRequest): DriverDescription = { + // Required fields, including the main class because python is not yet supported + val appResource = Option(request.appResource).getOrElse { + throw new SubmitRestMissingFieldException("Application jar is missing.") + } + val mainClass = Option(request.mainClass).getOrElse { + throw new SubmitRestMissingFieldException("Main class is missing.") + } + + // Optional fields + val sparkProperties = request.sparkProperties + val driverMemory = sparkProperties.get("spark.driver.memory") + val driverCores = sparkProperties.get("spark.driver.cores") + val driverExtraJavaOptions = sparkProperties.get("spark.driver.extraJavaOptions") + val driverExtraClassPath = sparkProperties.get("spark.driver.extraClassPath") + val driverExtraLibraryPath = sparkProperties.get("spark.driver.extraLibraryPath") + val superviseDriver = sparkProperties.get("spark.driver.supervise") + val appArgs = request.appArgs + val environmentVariables = request.environmentVariables + + // Construct driver description + val conf = new SparkConf(false) + .setAll(sparkProperties) + .set("spark.master", masterUrl) + val extraClassPath = driverExtraClassPath.toSeq.flatMap(_.split(File.pathSeparator)) + val extraLibraryPath = driverExtraLibraryPath.toSeq.flatMap(_.split(File.pathSeparator)) + val extraJavaOpts = driverExtraJavaOptions.map(Utils.splitCommandString).getOrElse(Seq.empty) + val sparkJavaOpts = Utils.sparkJavaOpts(conf) + val javaOpts = sparkJavaOpts ++ extraJavaOpts + val command = new Command( + "org.apache.spark.deploy.worker.DriverWrapper", + Seq("{{WORKER_URL}}", mainClass) ++ appArgs, // args to the DriverWrapper + environmentVariables, extraClassPath, extraLibraryPath, javaOpts) + val actualDriverMemory = driverMemory.map(Utils.memoryStringToMb).getOrElse(DEFAULT_MEMORY) + val actualDriverCores = driverCores.map(_.toInt).getOrElse(DEFAULT_CORES) + val actualSuperviseDriver = 
superviseDriver.map(_.toBoolean).getOrElse(DEFAULT_SUPERVISE) + new DriverDescription( + appResource, actualDriverMemory, actualDriverCores, actualSuperviseDriver, command) + } +} + +/** + * A default servlet that handles error cases that are not captured by other servlets. + */ +private class ErrorServlet extends StandaloneRestServlet { + private val serverVersion = StandaloneRestServer.PROTOCOL_VERSION + + /** Service a faulty request by returning an appropriate error message to the client. */ + protected override def service( + request: HttpServletRequest, + response: HttpServletResponse): Unit = { + val path = request.getPathInfo + val parts = path.stripPrefix("/").split("/").toSeq + var versionMismatch = false + var msg = + parts match { + case Nil => + // http://host:port/ + "Missing protocol version." + case `serverVersion` :: Nil => + // http://host:port/correct-version + "Missing the /submissions prefix." + case `serverVersion` :: "submissions" :: Nil => + // http://host:port/correct-version/submissions + "Missing an action: please specify one of /create, /kill, or /status." + case unknownVersion :: _ => + // http://host:port/unknown-version/* + versionMismatch = true + s"Unknown protocol version '$unknownVersion'." + case _ => + // never reached + s"Malformed path $path." + } + msg += s" Please submit requests through http://[host]:[port]/$serverVersion/submissions/..." + val error = handleError(msg) + // If there is a version mismatch, include the highest protocol version that + // this server supports in case the client wants to retry with our version + if (versionMismatch) { + error.highestProtocolVersion = serverVersion + response.setStatus(StandaloneRestServer.SC_UNKNOWN_PROTOCOL_VERSION) + } else { + response.setStatus(HttpServletResponse.SC_BAD_REQUEST) + } + sendResponse(error, response) + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolException.scala b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolException.scala new file mode 100644 index 0000000000000..d7a0bdbe10778 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolException.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.rest + +/** + * An exception thrown in the REST application submission protocol. + */ +private[spark] class SubmitRestProtocolException(message: String, cause: Throwable = null) + extends Exception(message, cause) + +/** + * An exception thrown if a field is missing from a [[SubmitRestProtocolMessage]]. 
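To make the path matching in ErrorServlet above concrete (host and port hypothetical; each message is also suffixed with a pointer to http://[host]:[port]/v1/submissions/...):

    //   http://master-host:6066/                -> "Missing protocol version."
    //   http://master-host:6066/v1              -> "Missing the /submissions prefix."
    //   http://master-host:6066/v1/submissions  -> "Missing an action: please specify one of
    //                                              /create, /kill, or /status."
    //   http://master-host:6066/v2/submissions  -> "Unknown protocol version 'v2'.", returned
    //                                              with code 468 and the highest supported version.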
+ */ +private[spark] class SubmitRestMissingFieldException(message: String) + extends SubmitRestProtocolException(message) + +/** + * An exception thrown if the REST client cannot reach the REST server. + */ +private[spark] class SubmitRestConnectionException(message: String, cause: Throwable) + extends SubmitRestProtocolException(message, cause) diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala new file mode 100644 index 0000000000000..b877898231e3e --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.rest + +import scala.util.Try + +import com.fasterxml.jackson.annotation._ +import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility +import com.fasterxml.jackson.annotation.JsonInclude.Include +import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import org.json4s.JsonAST._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.util.Utils + +/** + * An abstract message exchanged in the REST application submission protocol. + * + * This message is intended to be serialized to and deserialized from JSON in the exchange. + * Each message can either be a request or a response and consists of three common fields: + * (1) the action, which fully specifies the type of the message + * (2) the Spark version of the client / server + * (3) an optional message + */ +@JsonInclude(Include.NON_NULL) +@JsonAutoDetect(getterVisibility = Visibility.ANY, setterVisibility = Visibility.ANY) +@JsonPropertyOrder(alphabetic = true) +private[spark] abstract class SubmitRestProtocolMessage { + @JsonIgnore + val messageType = Utils.getFormattedClassName(this) + + val action: String = messageType + var message: String = null + + // For JSON deserialization + private def setAction(a: String): Unit = { } + + /** + * Serialize the message to JSON. + * This also ensures that the message is valid and its fields are in the expected format. + */ + def toJson: String = { + validate() + SubmitRestProtocolMessage.mapper.writeValueAsString(this) + } + + /** + * Assert the validity of the message. + * If the validation fails, throw a [[SubmitRestProtocolException]]. 
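As an example of the JSON these messages take on the wire, a response such as CreateSubmissionResponse (defined later in this patch) serialized with toJson would look roughly like this (identifiers and version are hypothetical):

    // Fields are ordered alphabetically and null fields are omitted:
    // {
    //   "action" : "CreateSubmissionResponse",
    //   "serverSparkVersion" : "1.3.0",
    //   "submissionId" : "driver-20150206-0000",
    //   "success" : true
    // }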
+ */ + final def validate(): Unit = { + try { + doValidate() + } catch { + case e: Exception => + throw new SubmitRestProtocolException(s"Validation of message $messageType failed!", e) + } + } + + /** Assert the validity of the message */ + protected def doValidate(): Unit = { + if (action == null) { + throw new SubmitRestMissingFieldException(s"The action field is missing in $messageType") + } + } + + /** Assert that the specified field is set in this message. */ + protected def assertFieldIsSet[T](value: T, name: String): Unit = { + if (value == null) { + throw new SubmitRestMissingFieldException(s"'$name' is missing in message $messageType.") + } + } + + /** + * Assert a condition when validating this message. + * If the assertion fails, throw a [[SubmitRestProtocolException]]. + */ + protected def assert(condition: Boolean, failMessage: String): Unit = { + if (!condition) { throw new SubmitRestProtocolException(failMessage) } + } +} + +/** + * Helper methods to process serialized [[SubmitRestProtocolMessage]]s. + */ +private[spark] object SubmitRestProtocolMessage { + private val packagePrefix = this.getClass.getPackage.getName + private val mapper = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) + .enable(SerializationFeature.INDENT_OUTPUT) + .registerModule(DefaultScalaModule) + + /** + * Parse the value of the action field from the given JSON. + * If the action field is not found, throw a [[SubmitRestMissingFieldException]]. + */ + def parseAction(json: String): String = { + parse(json).asInstanceOf[JObject].obj + .find { case (f, _) => f == "action" } + .map { case (_, v) => v.asInstanceOf[JString].s } + .getOrElse { + throw new SubmitRestMissingFieldException(s"Action field not found in JSON:\n$json") + } + } + + /** + * Construct a [[SubmitRestProtocolMessage]] from its JSON representation. + * + * This method first parses the action from the JSON and uses it to infer the message type. + * Note that the action must represent one of the [[SubmitRestProtocolMessage]]s defined in + * this package. Otherwise, a [[ClassNotFoundException]] will be thrown. + */ + def fromJson(json: String): SubmitRestProtocolMessage = { + val className = parseAction(json) + val clazz = Class.forName(packagePrefix + "." + className) + .asSubclass[SubmitRestProtocolMessage](classOf[SubmitRestProtocolMessage]) + fromJson(json, clazz) + } + + /** + * Construct a [[SubmitRestProtocolMessage]] from its JSON representation. + * + * This method determines the type of the message from the class provided instead of + * inferring it from the action field. This is useful for deserializing JSON that + * represents custom user-defined messages. + */ + def fromJson[T <: SubmitRestProtocolMessage](json: String, clazz: Class[T]): T = { + mapper.readValue(json, clazz) + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala new file mode 100644 index 0000000000000..9e1fd8c40cabd --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.rest + +import scala.util.Try + +import org.apache.spark.util.Utils + +/** + * An abstract request sent from the client in the REST application submission protocol. + */ +private[spark] abstract class SubmitRestProtocolRequest extends SubmitRestProtocolMessage { + var clientSparkVersion: String = null + protected override def doValidate(): Unit = { + super.doValidate() + assertFieldIsSet(clientSparkVersion, "clientSparkVersion") + } +} + +/** + * A request to launch a new application in the REST application submission protocol. + */ +private[spark] class CreateSubmissionRequest extends SubmitRestProtocolRequest { + var appResource: String = null + var mainClass: String = null + var appArgs: Array[String] = null + var sparkProperties: Map[String, String] = null + var environmentVariables: Map[String, String] = null + + protected override def doValidate(): Unit = { + super.doValidate() + assert(sparkProperties != null, "No Spark properties set!") + assertFieldIsSet(appResource, "appResource") + assertPropertyIsSet("spark.app.name") + assertPropertyIsBoolean("spark.driver.supervise") + assertPropertyIsNumeric("spark.driver.cores") + assertPropertyIsNumeric("spark.cores.max") + assertPropertyIsMemory("spark.driver.memory") + assertPropertyIsMemory("spark.executor.memory") + } + + private def assertPropertyIsSet(key: String): Unit = + assertFieldIsSet(sparkProperties.getOrElse(key, null), key) + + private def assertPropertyIsBoolean(key: String): Unit = + assertProperty[Boolean](key, "boolean", _.toBoolean) + + private def assertPropertyIsNumeric(key: String): Unit = + assertProperty[Int](key, "numeric", _.toInt) + + private def assertPropertyIsMemory(key: String): Unit = + assertProperty[Int](key, "memory", Utils.memoryStringToMb) + + /** Assert that a Spark property can be converted to a certain type. */ + private def assertProperty[T](key: String, valueType: String, convert: (String => T)): Unit = { + sparkProperties.get(key).foreach { value => + Try(convert(value)).getOrElse { + throw new SubmitRestProtocolException( + s"Property '$key' expected $valueType value: actual was '$value'.") + } + } + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolResponse.scala b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolResponse.scala new file mode 100644 index 0000000000000..16dfe041d4bea --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolResponse.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.rest + +import java.lang.Boolean + +/** + * An abstract response sent from the server in the REST application submission protocol. + */ +private[spark] abstract class SubmitRestProtocolResponse extends SubmitRestProtocolMessage { + var serverSparkVersion: String = null + var success: Boolean = null + var unknownFields: Array[String] = null + protected override def doValidate(): Unit = { + super.doValidate() + assertFieldIsSet(serverSparkVersion, "serverSparkVersion") + } +} + +/** + * A response to a [[CreateSubmissionRequest]] in the REST application submission protocol. + */ +private[spark] class CreateSubmissionResponse extends SubmitRestProtocolResponse { + var submissionId: String = null + protected override def doValidate(): Unit = { + super.doValidate() + assertFieldIsSet(success, "success") + } +} + +/** + * A response to a kill request in the REST application submission protocol. + */ +private[spark] class KillSubmissionResponse extends SubmitRestProtocolResponse { + var submissionId: String = null + protected override def doValidate(): Unit = { + super.doValidate() + assertFieldIsSet(submissionId, "submissionId") + assertFieldIsSet(success, "success") + } +} + +/** + * A response to a status request in the REST application submission protocol. + */ +private[spark] class SubmissionStatusResponse extends SubmitRestProtocolResponse { + var submissionId: String = null + var driverState: String = null + var workerId: String = null + var workerHostPort: String = null + + protected override def doValidate(): Unit = { + super.doValidate() + assertFieldIsSet(submissionId, "submissionId") + assertFieldIsSet(success, "success") + } +} + +/** + * An error response message used in the REST application submission protocol. 
+ */ +private[spark] class ErrorResponse extends SubmitRestProtocolResponse { + // The highest protocol version that the server knows about + // This is set when the client specifies an unknown version + var highestProtocolVersion: String = null + protected override def doValidate(): Unit = { + super.doValidate() + assertFieldIsSet(message, "message") + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala index ed02ca81e405c..e955636cf5b59 100644 --- a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala @@ -68,7 +68,8 @@ class JsonProtocolSuite extends FunSuite { val completedApps = Array[ApplicationInfo]() val activeDrivers = Array(createDriverInfo()) val completedDrivers = Array(createDriverInfo()) - val stateResponse = new MasterStateResponse("host", 8080, workers, activeApps, completedApps, + val stateResponse = new MasterStateResponse( + "host", 8080, None, workers, activeApps, completedApps, activeDrivers, completedDrivers, RecoveryState.ALIVE) val output = JsonProtocol.writeMasterState(stateResponse) assertValidJson(output) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 3f1355f82893e..1ddccae1262bc 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -141,7 +141,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (childArgs, classpath, sysProps, mainClass) = createLaunchEnv(appArgs) + val (childArgs, classpath, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) val childArgsStr = childArgs.mkString(" ") childArgsStr should include ("--class org.SomeClass") childArgsStr should include ("--executor-memory 5g") @@ -180,7 +180,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (childArgs, classpath, sysProps, mainClass) = createLaunchEnv(appArgs) + val (childArgs, classpath, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) childArgs.mkString(" ") should be ("arg1 arg2") mainClass should be ("org.SomeClass") classpath should have length (4) @@ -201,6 +201,18 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties } test("handles standalone cluster mode") { + testStandaloneCluster(useRest = true) + } + + test("handles legacy standalone cluster mode") { + testStandaloneCluster(useRest = false) + } + + /** + * Test whether the launch environment is correctly set up in standalone cluster mode. 
+ * @param useRest whether to use the REST submission gateway introduced in Spark 1.3 + */ + private def testStandaloneCluster(useRest: Boolean): Unit = { val clArgs = Seq( "--deploy-mode", "cluster", "--master", "spark://h:p", @@ -212,17 +224,26 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (childArgs, classpath, sysProps, mainClass) = createLaunchEnv(appArgs) + appArgs.useRest = useRest + val (childArgs, classpath, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) val childArgsStr = childArgs.mkString(" ") - childArgsStr should startWith ("--memory 4g --cores 5 --supervise") - childArgsStr should include regex ("launch spark://h:p .*thejar.jar org.SomeClass arg1 arg2") - mainClass should be ("org.apache.spark.deploy.Client") - classpath should have size (0) - sysProps should have size (5) + if (useRest) { + childArgsStr should endWith ("thejar.jar org.SomeClass arg1 arg2") + mainClass should be ("org.apache.spark.deploy.rest.StandaloneRestClient") + } else { + childArgsStr should startWith ("--supervise --memory 4g --cores 5") + childArgsStr should include regex "launch spark://h:p .*thejar.jar org.SomeClass arg1 arg2" + mainClass should be ("org.apache.spark.deploy.Client") + } + classpath should have size 0 + sysProps should have size 8 sysProps.keys should contain ("SPARK_SUBMIT") sysProps.keys should contain ("spark.master") sysProps.keys should contain ("spark.app.name") sysProps.keys should contain ("spark.jars") + sysProps.keys should contain ("spark.driver.memory") + sysProps.keys should contain ("spark.driver.cores") + sysProps.keys should contain ("spark.driver.supervise") sysProps.keys should contain ("spark.shuffle.spill") sysProps("spark.shuffle.spill") should be ("false") } @@ -239,7 +260,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (childArgs, classpath, sysProps, mainClass) = createLaunchEnv(appArgs) + val (childArgs, classpath, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) childArgs.mkString(" ") should be ("arg1 arg2") mainClass should be ("org.SomeClass") classpath should have length (1) @@ -261,7 +282,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (childArgs, classpath, sysProps, mainClass) = createLaunchEnv(appArgs) + val (childArgs, classpath, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) childArgs.mkString(" ") should be ("arg1 arg2") mainClass should be ("org.SomeClass") classpath should have length (1) @@ -281,7 +302,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "thejar.jar", "arg1", "arg2") val appArgs = new SparkSubmitArguments(clArgs) - val (_, _, sysProps, mainClass) = createLaunchEnv(appArgs) + val (_, _, sysProps, mainClass) = prepareSubmitEnvironment(appArgs) sysProps("spark.executor.memory") should be ("5g") sysProps("spark.master") should be ("yarn-cluster") mainClass should be ("org.apache.spark.deploy.yarn.Client") @@ -339,7 +360,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "--files", files, "thejar.jar") val appArgs = new SparkSubmitArguments(clArgs) - val sysProps = SparkSubmit.createLaunchEnv(appArgs)._3 + val sysProps = SparkSubmit.prepareSubmitEnvironment(appArgs)._3 appArgs.jars 
should be (Utils.resolveURIs(jars)) appArgs.files should be (Utils.resolveURIs(files)) sysProps("spark.jars") should be (Utils.resolveURIs(jars + ",thejar.jar")) @@ -354,7 +375,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "thejar.jar" ) val appArgs2 = new SparkSubmitArguments(clArgs2) - val sysProps2 = SparkSubmit.createLaunchEnv(appArgs2)._3 + val sysProps2 = SparkSubmit.prepareSubmitEnvironment(appArgs2)._3 appArgs2.files should be (Utils.resolveURIs(files)) appArgs2.archives should be (Utils.resolveURIs(archives)) sysProps2("spark.yarn.dist.files") should be (Utils.resolveURIs(files)) @@ -367,7 +388,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "mister.py" ) val appArgs3 = new SparkSubmitArguments(clArgs3) - val sysProps3 = SparkSubmit.createLaunchEnv(appArgs3)._3 + val sysProps3 = SparkSubmit.prepareSubmitEnvironment(appArgs3)._3 appArgs3.pyFiles should be (Utils.resolveURIs(pyFiles)) sysProps3("spark.submit.pyFiles") should be ( PythonRunner.formatPaths(Utils.resolveURIs(pyFiles)).mkString(",")) @@ -392,7 +413,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "thejar.jar" ) val appArgs = new SparkSubmitArguments(clArgs) - val sysProps = SparkSubmit.createLaunchEnv(appArgs)._3 + val sysProps = SparkSubmit.prepareSubmitEnvironment(appArgs)._3 sysProps("spark.jars") should be(Utils.resolveURIs(jars + ",thejar.jar")) sysProps("spark.files") should be(Utils.resolveURIs(files)) @@ -409,7 +430,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "thejar.jar" ) val appArgs2 = new SparkSubmitArguments(clArgs2) - val sysProps2 = SparkSubmit.createLaunchEnv(appArgs2)._3 + val sysProps2 = SparkSubmit.prepareSubmitEnvironment(appArgs2)._3 sysProps2("spark.yarn.dist.files") should be(Utils.resolveURIs(files)) sysProps2("spark.yarn.dist.archives") should be(Utils.resolveURIs(archives)) @@ -424,7 +445,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "mister.py" ) val appArgs3 = new SparkSubmitArguments(clArgs3) - val sysProps3 = SparkSubmit.createLaunchEnv(appArgs3)._3 + val sysProps3 = SparkSubmit.prepareSubmitEnvironment(appArgs3)._3 sysProps3("spark.submit.pyFiles") should be( PythonRunner.formatPaths(Utils.resolveURIs(pyFiles)).mkString(",")) } @@ -440,7 +461,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties val appArgs = new SparkSubmitArguments(args, Map("SPARK_CONF_DIR" -> path)) assert(appArgs.propertiesFile != null) assert(appArgs.propertiesFile.startsWith(path)) - appArgs.executorMemory should be ("2.3g") + appArgs.executorMemory should be ("2.3g") } } diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala new file mode 100644 index 0000000000000..29aed89b67aa7 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.rest + +import java.io.{File, FileInputStream, FileOutputStream, PrintWriter} +import java.util.jar.{JarEntry, JarOutputStream} +import java.util.zip.ZipEntry + +import scala.collection.mutable.ArrayBuffer +import scala.io.Source + +import akka.actor.ActorSystem +import com.google.common.io.ByteStreams +import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} +import org.scalatest.exceptions.TestFailedException + +import org.apache.spark._ +import org.apache.spark.util.Utils +import org.apache.spark.deploy.{SparkSubmit, SparkSubmitArguments} +import org.apache.spark.deploy.master.{DriverState, Master} +import org.apache.spark.deploy.worker.Worker + +/** + * End-to-end tests for the REST application submission protocol in standalone mode. + */ +class StandaloneRestSubmitSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAfterEach { + private val systemsToStop = new ArrayBuffer[ActorSystem] + private val masterRestUrl = startLocalCluster() + private val client = new StandaloneRestClient + private val mainJar = StandaloneRestSubmitSuite.createJar() + private val mainClass = StandaloneRestApp.getClass.getName.stripSuffix("$") + + override def afterAll() { + systemsToStop.foreach(_.shutdown()) + } + + test("simple submit until completion") { + val resultsFile = File.createTempFile("test-submit", ".txt") + val numbers = Seq(1, 2, 3) + val size = 500 + val submissionId = submitApplication(resultsFile, numbers, size) + waitUntilFinished(submissionId) + validateResult(resultsFile, numbers, size) + } + + test("kill empty submission") { + val response = client.killSubmission(masterRestUrl, "submission-that-does-not-exist") + val killResponse = getKillResponse(response) + val killSuccess = killResponse.success + assert(!killSuccess) + } + + test("kill running submission") { + val resultsFile = File.createTempFile("test-kill", ".txt") + val numbers = Seq(1, 2, 3) + val size = 500 + val submissionId = submitApplication(resultsFile, numbers, size) + val response = client.killSubmission(masterRestUrl, submissionId) + val killResponse = getKillResponse(response) + val killSuccess = killResponse.success + waitUntilFinished(submissionId) + val response2 = client.requestSubmissionStatus(masterRestUrl, submissionId) + val statusResponse = getStatusResponse(response2) + val statusSuccess = statusResponse.success + val driverState = statusResponse.driverState + assert(killSuccess) + assert(statusSuccess) + assert(driverState === DriverState.KILLED.toString) + // we should not see the expected results because we killed the submission + intercept[TestFailedException] { validateResult(resultsFile, numbers, size) } + } + + test("request status for empty submission") { + val response = client.requestSubmissionStatus(masterRestUrl, "submission-that-does-not-exist") + val statusResponse = getStatusResponse(response) + val statusSuccess = statusResponse.success + assert(!statusSuccess) + } + + /** + * Start a local cluster containing one Master and a few Workers. 
+ * Do not use [[org.apache.spark.deploy.LocalSparkCluster]] here because we want the REST URL. + * Return the Master's REST URL to which applications should be submitted. + */ + private def startLocalCluster(): String = { + val conf = new SparkConf(false) + .set("spark.master.rest.enabled", "true") + .set("spark.master.rest.port", "0") + val (numWorkers, coresPerWorker, memPerWorker) = (2, 1, 512) + val localHostName = Utils.localHostName() + val (masterSystem, masterPort, _, _masterRestPort) = + Master.startSystemAndActor(localHostName, 0, 0, conf) + val masterRestPort = _masterRestPort.getOrElse { fail("REST server not started on Master!") } + val masterUrl = "spark://" + localHostName + ":" + masterPort + val masterRestUrl = "spark://" + localHostName + ":" + masterRestPort + (1 to numWorkers).foreach { n => + val (workerSystem, _) = Worker.startSystemAndActor( + localHostName, 0, 0, coresPerWorker, memPerWorker, Array(masterUrl), null, Some(n)) + systemsToStop.append(workerSystem) + } + systemsToStop.append(masterSystem) + masterRestUrl + } + + /** Submit the [[StandaloneRestApp]] and return the corresponding submission ID. */ + private def submitApplication(resultsFile: File, numbers: Seq[Int], size: Int): String = { + val appArgs = Seq(resultsFile.getAbsolutePath) ++ numbers.map(_.toString) ++ Seq(size.toString) + val commandLineArgs = Array( + "--deploy-mode", "cluster", + "--master", masterRestUrl, + "--name", mainClass, + "--class", mainClass, + mainJar) ++ appArgs + val args = new SparkSubmitArguments(commandLineArgs) + val (_, _, sparkProperties, _) = SparkSubmit.prepareSubmitEnvironment(args) + val request = client.constructSubmitRequest( + mainJar, mainClass, appArgs.toArray, sparkProperties.toMap, Map.empty) + val response = client.createSubmission(masterRestUrl, request) + val submitResponse = getSubmitResponse(response) + val submissionId = submitResponse.submissionId + assert(submissionId != null, "Application submission was unsuccessful!") + submissionId + } + + /** Wait until the given submission has finished running up to the specified timeout. */ + private def waitUntilFinished(submissionId: String, maxSeconds: Int = 30): Unit = { + var finished = false + val expireTime = System.currentTimeMillis + maxSeconds * 1000 + while (!finished) { + val response = client.requestSubmissionStatus(masterRestUrl, submissionId) + val statusResponse = getStatusResponse(response) + val driverState = statusResponse.driverState + finished = + driverState != DriverState.SUBMITTED.toString && + driverState != DriverState.RUNNING.toString + if (System.currentTimeMillis > expireTime) { + fail(s"Driver $submissionId did not finish within $maxSeconds seconds.") + } + } + } + + /** Return the response as a submit response, or fail with error otherwise. */ + private def getSubmitResponse(response: SubmitRestProtocolResponse): CreateSubmissionResponse = { + response match { + case s: CreateSubmissionResponse => s + case e: ErrorResponse => fail(s"Server returned error: ${e.message}") + case r => fail(s"Expected submit response. Actual: ${r.toJson}") + } + } + + /** Return the response as a kill response, or fail with error otherwise. */ + private def getKillResponse(response: SubmitRestProtocolResponse): KillSubmissionResponse = { + response match { + case k: KillSubmissionResponse => k + case e: ErrorResponse => fail(s"Server returned error: ${e.message}") + case r => fail(s"Expected kill response. 
Actual: ${r.toJson}") + } + } + + /** Return the response as a status response, or fail with error otherwise. */ + private def getStatusResponse(response: SubmitRestProtocolResponse): SubmissionStatusResponse = { + response match { + case s: SubmissionStatusResponse => s + case e: ErrorResponse => fail(s"Server returned error: ${e.message}") + case r => fail(s"Expected status response. Actual: ${r.toJson}") + } + } + + /** Validate whether the application produced the corrupt output. */ + private def validateResult(resultsFile: File, numbers: Seq[Int], size: Int): Unit = { + val lines = Source.fromFile(resultsFile.getAbsolutePath).getLines().toSeq + val unexpectedContent = + if (lines.nonEmpty) { + "[\n" + lines.map { l => " " + l }.mkString("\n") + "\n]" + } else { + "[EMPTY]" + } + assert(lines.size === 2, s"Unexpected content in file: $unexpectedContent") + assert(lines(0).toInt === numbers.sum, s"Sum of ${numbers.mkString(",")} is incorrect") + assert(lines(1).toInt === (size / 2) + 1, "Result of Spark job is incorrect") + } +} + +private object StandaloneRestSubmitSuite { + private val pathPrefix = this.getClass.getPackage.getName.replaceAll("\\.", "/") + + /** + * Create a jar that contains all the class files needed for running the [[StandaloneRestApp]]. + * Return the absolute path to that jar. + */ + def createJar(): String = { + val jarFile = File.createTempFile("test-standalone-rest-protocol", ".jar") + val jarFileStream = new FileOutputStream(jarFile) + val jarStream = new JarOutputStream(jarFileStream, new java.util.jar.Manifest) + jarStream.putNextEntry(new ZipEntry(pathPrefix)) + getClassFiles.foreach { cf => + jarStream.putNextEntry(new JarEntry(pathPrefix + "/" + cf.getName)) + val in = new FileInputStream(cf) + ByteStreams.copy(in, jarStream) + in.close() + } + jarStream.close() + jarFileStream.close() + jarFile.getAbsolutePath + } + + /** + * Return a list of class files compiled for [[StandaloneRestApp]]. + * This includes all the anonymous classes used in the application. + */ + private def getClassFiles: Seq[File] = { + val className = Utils.getFormattedClassName(StandaloneRestApp) + val clazz = StandaloneRestApp.getClass + val basePath = clazz.getProtectionDomain.getCodeSource.getLocation.toURI.getPath + val baseDir = new File(basePath + "/" + pathPrefix) + baseDir.listFiles().filter(_.getName.contains(className)) + } +} + +/** + * Sample application to be submitted to the cluster using the REST gateway. + * All relevant classes will be packaged into a jar at run time. 
+ */ +object StandaloneRestApp { + // Usage: [path to results file] [num1] [num2] [num3] [rddSize] + // The first line of the results file should be (num1 + num2 + num3) + // The second line should be (rddSize / 2) + 1 + def main(args: Array[String]) { + assert(args.size == 5, s"Expected exactly 5 arguments: ${args.mkString(",")}") + val resultFile = new File(args(0)) + val writer = new PrintWriter(resultFile) + try { + val conf = new SparkConf() + val sc = new SparkContext(conf) + val firstLine = args(1).toInt + args(2).toInt + args(3).toInt + val secondLine = sc.parallelize(1 to args(4).toInt) + .map { i => (i / 2, i) } + .reduceByKey(_ + _) + .count() + writer.println(firstLine) + writer.println(secondLine) + } catch { + case e: Exception => + writer.println(e) + e.getStackTrace.foreach { l => writer.println(" " + l) } + } finally { + writer.close() + } + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala new file mode 100644 index 0000000000000..1d64ec201e647 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala @@ -0,0 +1,324 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.rest + +import java.lang.Boolean +import java.lang.Integer + +import org.json4s.jackson.JsonMethods._ +import org.scalatest.FunSuite + +import org.apache.spark.SparkConf + +/** + * Tests for the REST application submission protocol. 
+ */ +class SubmitRestProtocolSuite extends FunSuite { + + test("validate") { + val request = new DummyRequest + intercept[SubmitRestProtocolException] { request.validate() } // missing everything + request.clientSparkVersion = "1.2.3" + intercept[SubmitRestProtocolException] { request.validate() } // missing name and age + request.name = "something" + intercept[SubmitRestProtocolException] { request.validate() } // missing only age + request.age = 2 + intercept[SubmitRestProtocolException] { request.validate() } // age too low + request.age = 10 + request.validate() // everything is set properly + request.clientSparkVersion = null + intercept[SubmitRestProtocolException] { request.validate() } // missing only Spark version + request.clientSparkVersion = "1.2.3" + request.name = null + intercept[SubmitRestProtocolException] { request.validate() } // missing only name + request.message = "not-setting-name" + intercept[SubmitRestProtocolException] { request.validate() } // still missing name + } + + test("request to and from JSON") { + val request = new DummyRequest + intercept[SubmitRestProtocolException] { request.toJson } // implicit validation + request.clientSparkVersion = "1.2.3" + request.active = true + request.age = 25 + request.name = "jung" + val json = request.toJson + assertJsonEquals(json, dummyRequestJson) + val newRequest = SubmitRestProtocolMessage.fromJson(json, classOf[DummyRequest]) + assert(newRequest.clientSparkVersion === "1.2.3") + assert(newRequest.clientSparkVersion === "1.2.3") + assert(newRequest.active) + assert(newRequest.age === 25) + assert(newRequest.name === "jung") + assert(newRequest.message === null) + } + + test("response to and from JSON") { + val response = new DummyResponse + response.serverSparkVersion = "3.3.4" + response.success = true + val json = response.toJson + assertJsonEquals(json, dummyResponseJson) + val newResponse = SubmitRestProtocolMessage.fromJson(json, classOf[DummyResponse]) + assert(newResponse.serverSparkVersion === "3.3.4") + assert(newResponse.serverSparkVersion === "3.3.4") + assert(newResponse.success) + assert(newResponse.message === null) + } + + test("CreateSubmissionRequest") { + val message = new CreateSubmissionRequest + intercept[SubmitRestProtocolException] { message.validate() } + message.clientSparkVersion = "1.2.3" + message.appResource = "honey-walnut-cherry.jar" + message.mainClass = "org.apache.spark.examples.SparkPie" + val conf = new SparkConf(false) + conf.set("spark.app.name", "SparkPie") + message.sparkProperties = conf.getAll.toMap + message.validate() + // optional fields + conf.set("spark.jars", "mayonnaise.jar,ketchup.jar") + conf.set("spark.files", "fireball.png") + conf.set("spark.driver.memory", "512m") + conf.set("spark.driver.cores", "180") + conf.set("spark.driver.extraJavaOptions", " -Dslices=5 -Dcolor=mostly_red") + conf.set("spark.driver.extraClassPath", "food-coloring.jar") + conf.set("spark.driver.extraLibraryPath", "pickle.jar") + conf.set("spark.driver.supervise", "false") + conf.set("spark.executor.memory", "256m") + conf.set("spark.cores.max", "10000") + message.sparkProperties = conf.getAll.toMap + message.appArgs = Array("two slices", "a hint of cinnamon") + message.environmentVariables = Map("PATH" -> "/dev/null") + message.validate() + // bad fields + var badConf = conf.clone().set("spark.driver.cores", "one hundred feet") + message.sparkProperties = badConf.getAll.toMap + intercept[SubmitRestProtocolException] { message.validate() } + badConf = 
conf.clone().set("spark.driver.supervise", "nope, never") + message.sparkProperties = badConf.getAll.toMap + intercept[SubmitRestProtocolException] { message.validate() } + badConf = conf.clone().set("spark.cores.max", "two men") + message.sparkProperties = badConf.getAll.toMap + intercept[SubmitRestProtocolException] { message.validate() } + message.sparkProperties = conf.getAll.toMap + // test JSON + val json = message.toJson + assertJsonEquals(json, submitDriverRequestJson) + val newMessage = SubmitRestProtocolMessage.fromJson(json, classOf[CreateSubmissionRequest]) + assert(newMessage.clientSparkVersion === "1.2.3") + assert(newMessage.appResource === "honey-walnut-cherry.jar") + assert(newMessage.mainClass === "org.apache.spark.examples.SparkPie") + assert(newMessage.sparkProperties("spark.app.name") === "SparkPie") + assert(newMessage.sparkProperties("spark.jars") === "mayonnaise.jar,ketchup.jar") + assert(newMessage.sparkProperties("spark.files") === "fireball.png") + assert(newMessage.sparkProperties("spark.driver.memory") === "512m") + assert(newMessage.sparkProperties("spark.driver.cores") === "180") + assert(newMessage.sparkProperties("spark.driver.extraJavaOptions") === " -Dslices=5 -Dcolor=mostly_red") + assert(newMessage.sparkProperties("spark.driver.extraClassPath") === "food-coloring.jar") + assert(newMessage.sparkProperties("spark.driver.extraLibraryPath") === "pickle.jar") + assert(newMessage.sparkProperties("spark.driver.supervise") === "false") + assert(newMessage.sparkProperties("spark.executor.memory") === "256m") + assert(newMessage.sparkProperties("spark.cores.max") === "10000") + assert(newMessage.appArgs === message.appArgs) + assert(newMessage.sparkProperties === message.sparkProperties) + assert(newMessage.environmentVariables === message.environmentVariables) + } + + test("CreateSubmissionResponse") { + val message = new CreateSubmissionResponse + intercept[SubmitRestProtocolException] { message.validate() } + message.serverSparkVersion = "1.2.3" + message.submissionId = "driver_123" + message.success = true + message.validate() + // test JSON + val json = message.toJson + assertJsonEquals(json, submitDriverResponseJson) + val newMessage = SubmitRestProtocolMessage.fromJson(json, classOf[CreateSubmissionResponse]) + assert(newMessage.serverSparkVersion === "1.2.3") + assert(newMessage.submissionId === "driver_123") + assert(newMessage.success) + } + + test("KillSubmissionResponse") { + val message = new KillSubmissionResponse + intercept[SubmitRestProtocolException] { message.validate() } + message.serverSparkVersion = "1.2.3" + message.submissionId = "driver_123" + message.success = true + message.validate() + // test JSON + val json = message.toJson + assertJsonEquals(json, killDriverResponseJson) + val newMessage = SubmitRestProtocolMessage.fromJson(json, classOf[KillSubmissionResponse]) + assert(newMessage.serverSparkVersion === "1.2.3") + assert(newMessage.submissionId === "driver_123") + assert(newMessage.success) + } + + test("SubmissionStatusResponse") { + val message = new SubmissionStatusResponse + intercept[SubmitRestProtocolException] { message.validate() } + message.serverSparkVersion = "1.2.3" + message.submissionId = "driver_123" + message.success = true + message.validate() + // optional fields + message.driverState = "RUNNING" + message.workerId = "worker_123" + message.workerHostPort = "1.2.3.4:7780" + // test JSON + val json = message.toJson + assertJsonEquals(json, driverStatusResponseJson) + val newMessage = 
SubmitRestProtocolMessage.fromJson(json, classOf[SubmissionStatusResponse]) + assert(newMessage.serverSparkVersion === "1.2.3") + assert(newMessage.submissionId === "driver_123") + assert(newMessage.driverState === "RUNNING") + assert(newMessage.success) + assert(newMessage.workerId === "worker_123") + assert(newMessage.workerHostPort === "1.2.3.4:7780") + } + + test("ErrorResponse") { + val message = new ErrorResponse + intercept[SubmitRestProtocolException] { message.validate() } + message.serverSparkVersion = "1.2.3" + message.message = "Field not found in submit request: X" + message.validate() + // test JSON + val json = message.toJson + assertJsonEquals(json, errorJson) + val newMessage = SubmitRestProtocolMessage.fromJson(json, classOf[ErrorResponse]) + assert(newMessage.serverSparkVersion === "1.2.3") + assert(newMessage.message === "Field not found in submit request: X") + } + + private val dummyRequestJson = + """ + |{ + | "action" : "DummyRequest", + | "active" : true, + | "age" : 25, + | "clientSparkVersion" : "1.2.3", + | "name" : "jung" + |} + """.stripMargin + + private val dummyResponseJson = + """ + |{ + | "action" : "DummyResponse", + | "serverSparkVersion" : "3.3.4", + | "success": true + |} + """.stripMargin + + private val submitDriverRequestJson = + """ + |{ + | "action" : "CreateSubmissionRequest", + | "appArgs" : [ "two slices", "a hint of cinnamon" ], + | "appResource" : "honey-walnut-cherry.jar", + | "clientSparkVersion" : "1.2.3", + | "environmentVariables" : { + | "PATH" : "/dev/null" + | }, + | "mainClass" : "org.apache.spark.examples.SparkPie", + | "sparkProperties" : { + | "spark.driver.extraLibraryPath" : "pickle.jar", + | "spark.jars" : "mayonnaise.jar,ketchup.jar", + | "spark.driver.supervise" : "false", + | "spark.app.name" : "SparkPie", + | "spark.cores.max" : "10000", + | "spark.driver.memory" : "512m", + | "spark.files" : "fireball.png", + | "spark.driver.cores" : "180", + | "spark.driver.extraJavaOptions" : " -Dslices=5 -Dcolor=mostly_red", + | "spark.executor.memory" : "256m", + | "spark.driver.extraClassPath" : "food-coloring.jar" + | } + |} + """.stripMargin + + private val submitDriverResponseJson = + """ + |{ + | "action" : "CreateSubmissionResponse", + | "serverSparkVersion" : "1.2.3", + | "submissionId" : "driver_123", + | "success" : true + |} + """.stripMargin + + private val killDriverResponseJson = + """ + |{ + | "action" : "KillSubmissionResponse", + | "serverSparkVersion" : "1.2.3", + | "submissionId" : "driver_123", + | "success" : true + |} + """.stripMargin + + private val driverStatusResponseJson = + """ + |{ + | "action" : "SubmissionStatusResponse", + | "driverState" : "RUNNING", + | "serverSparkVersion" : "1.2.3", + | "submissionId" : "driver_123", + | "success" : true, + | "workerHostPort" : "1.2.3.4:7780", + | "workerId" : "worker_123" + |} + """.stripMargin + + private val errorJson = + """ + |{ + | "action" : "ErrorResponse", + | "message" : "Field not found in submit request: X", + | "serverSparkVersion" : "1.2.3" + |} + """.stripMargin + + /** Assert that the contents in the two JSON strings are equal after ignoring whitespace. 
*/ + private def assertJsonEquals(jsonString1: String, jsonString2: String): Unit = { + val trimmedJson1 = jsonString1.trim + val trimmedJson2 = jsonString2.trim + val json1 = compact(render(parse(trimmedJson1))) + val json2 = compact(render(parse(trimmedJson2))) + // Put this on a separate line to avoid printing comparison twice when test fails + val equals = json1 == json2 + assert(equals, "\"[%s]\" did not equal \"[%s]\"".format(trimmedJson1, trimmedJson2)) + } +} + +private class DummyResponse extends SubmitRestProtocolResponse +private class DummyRequest extends SubmitRestProtocolRequest { + var active: Boolean = null + var age: Integer = null + var name: String = null + protected override def doValidate(): Unit = { + super.doValidate() + assertFieldIsSet(name, "name") + assertFieldIsSet(age, "age") + assert(age > 5, "Not old enough!") + } +} diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala index 855f1b6276089..054a4c64897a9 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala @@ -29,9 +29,9 @@ class KryoSerializerDistributedSuite extends FunSuite { test("kryo objects are serialised consistently in different processes") { val conf = new SparkConf(false) - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - conf.set("spark.kryo.registrator", classOf[AppJarRegistrator].getName) - conf.set("spark.task.maxFailures", "1") + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .set("spark.kryo.registrator", classOf[AppJarRegistrator].getName) + .set("spark.task.maxFailures", "1") val jar = TestUtils.createJarWithClasses(List(AppJarRegistrator.customClassName)) conf.setJars(List(jar.getPath)) diff --git a/pom.xml b/pom.xml index aef450ae63121..da8ee077ddf85 100644 --- a/pom.xml +++ b/pom.xml @@ -154,6 +154,7 @@ org.scala-lang 3.6.3 1.8.8 + 2.4.4 1.1.1.6
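For reference, a minimal sketch of how the protocol messages added in this commit round-trip through JSON. Only classes, fields, and methods defined above are used; the jar path, class name, version string, and object name are made-up illustrative values, and because the classes are `private[spark]` such a snippet would have to live under the `org.apache.spark` namespace (e.g. in a test) to compile:

```scala
package org.apache.spark.deploy.rest

// Hypothetical round trip through the message classes added in this commit.
object SubmitRequestRoundTripExample {
  def main(args: Array[String]): Unit = {
    val request = new CreateSubmissionRequest
    // validate() requires clientSparkVersion, appResource and "spark.app.name";
    // all of the concrete values below are made up for illustration.
    request.clientSparkVersion = "1.3.0"
    request.appResource = "hdfs:///jars/example.jar"
    request.mainClass = "org.example.Main"
    request.appArgs = Array("arg1", "arg2")
    request.sparkProperties = Map("spark.app.name" -> "ExampleApp")

    // toJson validates the message and serializes it with the shared ObjectMapper
    val json = request.toJson

    // fromJson reads the "action" field and instantiates the matching message class
    val generic = SubmitRestProtocolMessage.fromJson(json)
    assert(generic.isInstanceOf[CreateSubmissionRequest])

    // The typed overload skips the action lookup when the expected class is known up front
    val typed = SubmitRestProtocolMessage.fromJson(json, classOf[CreateSubmissionRequest])
    assert(typed.mainClass == "org.example.Main")
  }
}
```

The JSON fixtures in SubmitRestProtocolSuite above show the serialized form these messages take on the wire.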

      +### Latent Dirichlet Allocation (LDA) + +[Latent Dirichlet Allocation (LDA)](http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) +is a topic model which infers topics from a collection of text documents. +LDA can be thought of as a clustering algorithm as follows: + +* Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. +* Topics and documents both exist in a feature space, where feature vectors are vectors of word counts. +* Rather than estimating a clustering using a traditional distance, LDA uses a function based + on a statistical model of how text documents are generated. + +LDA takes in a collection of documents as vectors of word counts. +It learns clustering using [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +on the likelihood function. After fitting on the documents, LDA provides: + +* Topics: Inferred topics, each of which is a probability distribution over terms (words). +* Topic distributions for documents: For each document in the training set, LDA gives a probability distribution over topics. + +LDA takes the following parameters: + +* `k`: Number of topics (i.e., cluster centers) +* `maxIterations`: Limit on the number of iterations of EM used for learning +* `docConcentration`: Hyperparameter for prior over documents' distributions over topics. Currently must be > 1, where larger values encourage smoother inferred distributions. +* `topicConcentration`: Hyperparameter for prior over topics' distributions over terms (words). Currently must be > 1, where larger values encourage smoother inferred distributions. +* `checkpointInterval`: If using checkpointing (set in the Spark configuration), this parameter specifies the frequency with which checkpoints will be created. If `maxIterations` is large, using checkpointing can help reduce shuffle file sizes on disk and help with failure recovery. + +*Note*: LDA is a new feature with some missing functionality. In particular, it does not yet +support prediction on new documents, and it does not have a Python API. These will be added in the future. + ### Examples #### k-means @@ -293,6 +322,104 @@ for i in range(2):
    +#### Latent Dirichlet Allocation (LDA) Example + +In the following example, we load word count vectors representing a corpus of documents. +We then use [LDA](api/scala/index.html#org.apache.spark.mllib.clustering.LDA) +to infer three topics from the documents. The number of desired clusters is passed +to the algorithm. We then output the topics, represented as probability distributions over words. + +
    +
    + +{% highlight scala %} +import org.apache.spark.mllib.clustering.LDA +import org.apache.spark.mllib.linalg.Vectors + +// Load and parse the data +val data = sc.textFile("data/mllib/sample_lda_data.txt") +val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))) +// Index documents with unique IDs +val corpus = parsedData.zipWithIndex.map(_.swap).cache() + +// Cluster the documents into three topics using LDA +val ldaModel = new LDA().setK(3).run(corpus) + +// Output topics. Each is a distribution over words (matching word count vectors) +println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") +val topics = ldaModel.topicsMatrix +for (topic <- Range(0, 3)) { + print("Topic " + topic + ":") + for (word <- Range(0, ldaModel.vocabSize)) { print(" " + topics(word, topic)); } + println() +} +{% endhighlight %} +
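Continuing the Scala example just above: printing the full `topicsMatrix` is unwieldy for a realistic vocabulary, so a common follow-up is to summarize each topic by its highest-weight terms. A rough sketch using only `ldaModel.topicsMatrix` and `ldaModel.vocabSize` from the example (the choice of five terms is arbitrary):

```scala
// Continues from the example above: `ldaModel` is the fitted model.
val topicMatrix = ldaModel.topicsMatrix
for (topic <- 0 until 3) {
  // Pair each term index with its weight in this topic, then keep the top 5.
  val termWeights = (0 until ldaModel.vocabSize).map(term => (term, topicMatrix(term, topic)))
  val topTerms = termWeights.sortBy { case (_, weight) => -weight }.take(5)
  println(s"Topic $topic top terms: " +
    topTerms.map { case (term, weight) => s"$term -> $weight" }.mkString(", "))
}
```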
    + +
    +{% highlight java %} +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.clustering.DistributedLDAModel; +import org.apache.spark.mllib.clustering.LDA; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.SparkConf; + +public class JavaLDAExample { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("LDA Example"); + JavaSparkContext sc = new JavaSparkContext(conf); + + // Load and parse the data + String path = "data/mllib/sample_lda_data.txt"; + JavaRDD data = sc.textFile(path); + JavaRDD parsedData = data.map( + new Function() { + public Vector call(String s) { + String[] sarray = s.trim().split(" "); + double[] values = new double[sarray.length]; + for (int i = 0; i < sarray.length; i++) + values[i] = Double.parseDouble(sarray[i]); + return Vectors.dense(values); + } + } + ); + // Index documents with unique IDs + JavaPairRDD corpus = JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map( + new Function, Tuple2>() { + public Tuple2 call(Tuple2 doc_id) { + return doc_id.swap(); + } + } + )); + corpus.cache(); + + // Cluster the documents into three topics using LDA + DistributedLDAModel ldaModel = new LDA().setK(3).run(corpus); + + // Output topics. Each is a distribution over words (matching word count vectors) + System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize() + + " words):"); + Matrix topics = ldaModel.topicsMatrix(); + for (int topic = 0; topic < 3; topic++) { + System.out.print("Topic " + topic + ":"); + for (int word = 0; word < ldaModel.vocabSize(); word++) { + System.out.print(" " + topics.apply(word, topic)); + } + System.out.println(); + } + } +} +{% endhighlight %} +
    + +
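Whichever language is used, the optional parameters listed before these examples (`maxIterations`, `docConcentration`, `topicConcentration`, `checkpointInterval`) are configured on the `LDA` builder before calling `run`. A hypothetical Scala sketch reusing the `corpus` RDD built above; treat the exact setter names and values as illustrative assumptions rather than a definitive API reference:

```scala
import org.apache.spark.mllib.clustering.LDA

// Assumed builder-style setters mirroring the parameter list above
// (setter names and values are illustrative, not normative).
val lda = new LDA()
  .setK(3)
  .setMaxIterations(50)
  .setDocConcentration(1.1)   // must be > 1, per the note above
  .setTopicConcentration(1.1) // must be > 1, per the note above
  .setCheckpointInterval(10)

// `corpus` is the indexed RDD of (id, word-count vector) pairs built above
val ldaModel = lda.run(corpus)
```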
    + + In order to run the above application, follow the instructions provided in the [Self-Contained Applications](quick-start.html#self-contained-applications) section of the Spark diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java new file mode 100644 index 0000000000000..f394ff2084463 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import scala.Tuple2; + +import org.apache.spark.api.java.*; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.clustering.DistributedLDAModel; +import org.apache.spark.mllib.clustering.LDA; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.SparkConf; + +public class JavaLDAExample { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("LDA Example"); + JavaSparkContext sc = new JavaSparkContext(conf); + + // Load and parse the data + String path = "data/mllib/sample_lda_data.txt"; + JavaRDD data = sc.textFile(path); + JavaRDD parsedData = data.map( + new Function() { + public Vector call(String s) { + String[] sarray = s.trim().split(" "); + double[] values = new double[sarray.length]; + for (int i = 0; i < sarray.length; i++) + values[i] = Double.parseDouble(sarray[i]); + return Vectors.dense(values); + } + } + ); + // Index documents with unique IDs + JavaPairRDD corpus = JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map( + new Function, Tuple2>() { + public Tuple2 call(Tuple2 doc_id) { + return doc_id.swap(); + } + } + )); + corpus.cache(); + + // Cluster the documents into three topics using LDA + DistributedLDAModel ldaModel = new LDA().setK(3).run(corpus); + + // Output topics. 
Each is a distribution over words (matching word count vectors) + System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize() + + " words):"); + Matrix topics = ldaModel.topicsMatrix(); + for (int topic = 0; topic < 3; topic++) { + System.out.print("Topic " + topic + ":"); + for (int word = 0; word < ldaModel.vocabSize(); word++) { + System.out.print(" " + topics.apply(word, topic)); + } + System.out.println(); + } + } +} From 4dfe180fc893bee1146161f8b2a6efd4d6d2bb8c Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Mon, 9 Feb 2015 09:44:53 +0000 Subject: [PATCH 063/357] [SPARK-5473] [EC2] Expose SSH failures after status checks pass If there is some fatal problem with launching a cluster, `spark-ec2` just hangs without giving the user useful feedback on what the problem is. This PR exposes the output of the SSH calls to the user if the SSH test fails during cluster launch for any reason but the instance status checks are all green. It also removes the growing trail of dots while waiting in favor of a fixed 3 dots. For example: ``` $ ./ec2/spark-ec2 -k key -i /incorrect/path/identity.pem --instance-type m3.medium --slaves 1 --zone us-east-1c launch "spark-test" Setting up security groups... Searching for existing cluster spark-test... Spark AMI: ami-35b1885c Launching instances... Launched 1 slaves in us-east-1c, regid = r-7dadd096 Launched master in us-east-1c, regid = r-fcadd017 Waiting for cluster to enter 'ssh-ready' state... Warning: SSH connection error. (This could be temporary.) Host: 127.0.0.1 SSH return code: 255 SSH output: Warning: Identity file /incorrect/path/identity.pem not accessible: No such file or directory. Warning: Permanently added '127.0.0.1' (RSA) to the list of known hosts. Permission denied (publickey). ``` This should give users enough information when some unrecoverable error occurs during launch so they can know to abort the launch. This will help avoid situations like the ones reported [here on Stack Overflow](http://stackoverflow.com/q/28002443/) and [here on the user list](http://mail-archives.apache.org/mod_mbox/spark-user/201501.mbox/%3C1422323829398-21381.postn3.nabble.com%3E), where the users couldn't tell what the problem was because it was being hidden by `spark-ec2`. This is a usability improvement that should be backported to 1.2. Resolves [SPARK-5473](https://issues.apache.org/jira/browse/SPARK-5473). Author: Nicholas Chammas Closes #4262 from nchammas/expose-ssh-failure and squashes the following commits: 8bda6ed [Nicholas Chammas] default to print SSH output 2b92534 [Nicholas Chammas] show SSH output after status check pass --- ec2/spark_ec2.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 725b1e47e0cea..87b2112fe4628 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -34,6 +34,7 @@ import sys import tarfile import tempfile +import textwrap import time import urllib2 import warnings @@ -681,21 +682,32 @@ def setup_spark_cluster(master, opts): print "Ganglia started at http://%s:5080/ganglia" % master -def is_ssh_available(host, opts): +def is_ssh_available(host, opts, print_ssh_output=True): """ Check if SSH is available on a host. 
""" - try: - with open(os.devnull, 'w') as devnull: - ret = subprocess.check_call( - ssh_command(opts) + ['-t', '-t', '-o', 'ConnectTimeout=3', - '%s@%s' % (opts.user, host), stringify_command('true')], - stdout=devnull, - stderr=devnull - ) - return ret == 0 - except subprocess.CalledProcessError as e: - return False + s = subprocess.Popen( + ssh_command(opts) + ['-t', '-t', '-o', 'ConnectTimeout=3', + '%s@%s' % (opts.user, host), stringify_command('true')], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT # we pipe stderr through stdout to preserve output order + ) + cmd_output = s.communicate()[0] # [1] is stderr, which we redirected to stdout + + if s.returncode != 0 and print_ssh_output: + # extra leading newline is for spacing in wait_for_cluster_state() + print textwrap.dedent("""\n + Warning: SSH connection error. (This could be temporary.) + Host: {h} + SSH return code: {r} + SSH output: {o} + """).format( + h=host, + r=s.returncode, + o=cmd_output.strip() + ) + + return s.returncode == 0 def is_cluster_ssh_available(cluster_instances, opts): From 0793ee1b4dea1f4b0df749e8ad7c1ab70b512faf Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Mon, 9 Feb 2015 10:12:12 +0000 Subject: [PATCH 064/357] SPARK-2149. [MLLIB] Univariate kernel density estimation Author: Sandy Ryza Closes #1093 from sryza/sandy-spark-2149 and squashes the following commits: 5f06b33 [Sandy Ryza] More review comments 0f73060 [Sandy Ryza] Respond to Sean's review comments 0dfa005 [Sandy Ryza] SPARK-2149. Univariate kernel density estimation --- .../spark/mllib/stat/KernelDensity.scala | 71 +++++++++++++++++++ .../apache/spark/mllib/stat/Statistics.scala | 14 ++++ .../spark/mllib/stat/KernelDensitySuite.scala | 47 ++++++++++++ 3 files changed, 132 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala new file mode 100644 index 0000000000000..0deef11b4511a --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.stat + +import org.apache.spark.rdd.RDD + +private[stat] object KernelDensity { + /** + * Given a set of samples from a distribution, estimates its density at the set of given points. + * Uses a Gaussian kernel with the given standard deviation. 
+ */ + def estimate(samples: RDD[Double], standardDeviation: Double, + evaluationPoints: Array[Double]): Array[Double] = { + if (standardDeviation <= 0.0) { + throw new IllegalArgumentException("Standard deviation must be positive") + } + + // This gets used in each Gaussian PDF computation, so compute it up front + val logStandardDeviationPlusHalfLog2Pi = + Math.log(standardDeviation) + 0.5 * Math.log(2 * Math.PI) + + val (points, count) = samples.aggregate((new Array[Double](evaluationPoints.length), 0))( + (x, y) => { + var i = 0 + while (i < evaluationPoints.length) { + x._1(i) += normPdf(y, standardDeviation, logStandardDeviationPlusHalfLog2Pi, + evaluationPoints(i)) + i += 1 + } + (x._1, i) + }, + (x, y) => { + var i = 0 + while (i < evaluationPoints.length) { + x._1(i) += y._1(i) + i += 1 + } + (x._1, x._2 + y._2) + }) + + var i = 0 + while (i < points.length) { + points(i) /= count + i += 1 + } + points + } + + private def normPdf(mean: Double, standardDeviation: Double, + logStandardDeviationPlusHalfLog2Pi: Double, x: Double): Double = { + val x0 = x - mean + val x1 = x0 / standardDeviation + val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi + Math.exp(logDensity) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala index b3fad0c52d655..32561620ac914 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala @@ -149,4 +149,18 @@ object Statistics { def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = { ChiSqTest.chiSquaredFeatures(data) } + + /** + * Given an empirical distribution defined by the input RDD of samples, estimate its density at + * each of the given evaluation points using a Gaussian kernel. + * + * @param samples The samples RDD used to define the empirical distribution. + * @param standardDeviation The standard deviation of the kernel Gaussians. + * @param evaluationPoints The points at which to estimate densities. + * @return An array the same size as evaluationPoints with the density at each point. + */ + def kernelDensity(samples: RDD[Double], standardDeviation: Double, + evaluationPoints: Iterable[Double]): Array[Double] = { + KernelDensity.estimate(samples, standardDeviation, evaluationPoints.toArray) + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala new file mode 100644 index 0000000000000..f6a1e19f50296 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/KernelDensitySuite.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.stat + +import org.scalatest.FunSuite + +import org.apache.commons.math3.distribution.NormalDistribution + +import org.apache.spark.mllib.util.LocalClusterSparkContext + +class KernelDensitySuite extends FunSuite with LocalClusterSparkContext { + test("kernel density single sample") { + val rdd = sc.parallelize(Array(5.0)) + val evaluationPoints = Array(5.0, 6.0) + val densities = KernelDensity.estimate(rdd, 3.0, evaluationPoints) + val normal = new NormalDistribution(5.0, 3.0) + val acceptableErr = 1e-6 + assert(densities(0) - normal.density(5.0) < acceptableErr) + assert(densities(0) - normal.density(6.0) < acceptableErr) + } + + test("kernel density multiple samples") { + val rdd = sc.parallelize(Array(5.0, 10.0)) + val evaluationPoints = Array(5.0, 6.0) + val densities = KernelDensity.estimate(rdd, 3.0, evaluationPoints) + val normal1 = new NormalDistribution(5.0, 3.0) + val normal2 = new NormalDistribution(10.0, 3.0) + val acceptableErr = 1e-6 + assert(densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2 < acceptableErr) + assert(densities(0) - (normal1.density(6.0) + normal2.density(6.0)) / 2 < acceptableErr) + } +} From de7806048ac49a8bfdf44d8f87bc11cea1dfb242 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 9 Feb 2015 10:33:57 -0800 Subject: [PATCH 065/357] SPARK-4267 [YARN] Failing to launch jobs on Spark on YARN with Hadoop 2.5.0 or later Before passing to YARN, escape arguments in "extraJavaOptions" args, in order to correctly handle cases like -Dfoo="one two three". Also standardize how these args are handled and ensure that individual args are treated as stand-alone args, not one string. vanzin andrewor14 Author: Sean Owen Closes #4452 from srowen/SPARK-4267.2 and squashes the following commits: c8297d2 [Sean Owen] Before passing to YARN, escape arguments in "extraJavaOptions" args, in order to correctly handle cases like -Dfoo="one two three". Also standardize how these args are handled and ensure that individual args are treated as stand-alone args, not one string. --- .../org/apache/spark/deploy/yarn/Client.scala | 9 +++++---- .../spark/deploy/yarn/ExecutorRunnable.scala | 17 +++++++++-------- .../spark/deploy/yarn/YarnClusterSuite.scala | 6 ++++-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index e7005094b5f3c..8afc1ccdad732 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -435,10 +435,11 @@ private[spark] class Client( // Include driver-specific java options if we are launching a driver if (isClusterMode) { - sparkConf.getOption("spark.driver.extraJavaOptions") + val driverOpts = sparkConf.getOption("spark.driver.extraJavaOptions") .orElse(sys.env.get("SPARK_JAVA_OPTS")) - .map(Utils.splitCommandString).getOrElse(Seq.empty) - .foreach(opts => javaOpts += opts) + driverOpts.foreach { opts => + javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell) + } val libraryPaths = Seq(sys.props.get("spark.driver.extraLibraryPath"), sys.props.get("spark.driver.libraryPath")).flatten if (libraryPaths.nonEmpty) { @@ -460,7 +461,7 @@ private[spark] class Client( val msg = s"$amOptsKey is not allowed to alter memory settings (was '$opts')." 
throw new SparkException(msg) } - javaOpts ++= Utils.splitCommandString(opts) + javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell) } } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index 408cf09b9bdfa..7cd8c5f0f9204 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -128,14 +128,15 @@ class ExecutorRunnable( // Set the JVM memory val executorMemoryString = executorMemory + "m" - javaOpts += "-Xms" + executorMemoryString + " -Xmx" + executorMemoryString + " " + javaOpts += "-Xms" + executorMemoryString + javaOpts += "-Xmx" + executorMemoryString // Set extra Java options for the executor, if defined sys.props.get("spark.executor.extraJavaOptions").foreach { opts => - javaOpts += opts + javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell) } sys.env.get("SPARK_JAVA_OPTS").foreach { opts => - javaOpts += opts + javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell) } sys.props.get("spark.executor.extraLibraryPath").foreach { p => prefixEnv = Some(Utils.libraryPathEnvPrefix(Seq(p))) @@ -173,11 +174,11 @@ class ExecutorRunnable( // The options are based on // http://www.oracle.com/technetwork/java/gc-tuning-5-138395.html#0.0.0.%20When%20to%20Use // %20the%20Concurrent%20Low%20Pause%20Collector|outline - javaOpts += " -XX:+UseConcMarkSweepGC " - javaOpts += " -XX:+CMSIncrementalMode " - javaOpts += " -XX:+CMSIncrementalPacing " - javaOpts += " -XX:CMSIncrementalDutyCycleMin=0 " - javaOpts += " -XX:CMSIncrementalDutyCycle=10 " + javaOpts += "-XX:+UseConcMarkSweepGC" + javaOpts += "-XX:+CMSIncrementalMode" + javaOpts += "-XX:+CMSIncrementalPacing" + javaOpts += "-XX:CMSIncrementalDutyCycleMin=0" + javaOpts += "-XX:CMSIncrementalDutyCycle=10" } */ diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index eda40efc4c77f..e39de82740b1d 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -75,6 +75,8 @@ class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers wit private var oldConf: Map[String, String] = _ override def beforeAll() { + super.beforeAll() + tempDir = Utils.createTempDir() val logConfDir = new File(tempDir, "log4j") @@ -129,8 +131,8 @@ class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers wit sys.props += ("spark.executor.instances" -> "1") sys.props += ("spark.driver.extraClassPath" -> childClasspath) sys.props += ("spark.executor.extraClassPath" -> childClasspath) - - super.beforeAll() + sys.props += ("spark.executor.extraJavaOptions" -> "-Dfoo=\"one two three\"") + sys.props += ("spark.driver.extraJavaOptions" -> "-Dfoo=\"one two three\"") } override def afterAll() { From afb131637d96e1e5e07eb8abf24e32e7f3b2304d Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 9 Feb 2015 11:42:52 -0800 Subject: [PATCH 066/357] [SPARK-5678] Convert DataFrame to pandas.DataFrame and Series ``` pyspark.sql.DataFrame.to_pandas = to_pandas(self) unbound pyspark.sql.DataFrame method Collect all the rows and return a `pandas.DataFrame`. 
>>> df.to_pandas() # doctest: +SKIP age name 0 2 Alice 1 5 Bob pyspark.sql.Column.to_pandas = to_pandas(self) unbound pyspark.sql.Column method Return a pandas.Series from the column >>> df.age.to_pandas() # doctest: +SKIP 0 2 1 5 dtype: int64 ``` Not tests by jenkins (they depends on pandas) Author: Davies Liu Closes #4476 from davies/to_pandas and squashes the following commits: 6276fb6 [Davies Liu] Convert DataFrame to pandas.DataFrame and Series --- python/pyspark/sql.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index e55f285a778c4..6a6dfbc5851b8 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -2284,6 +2284,18 @@ def addColumn(self, colName, col): """ return self.select('*', col.alias(colName)) + def to_pandas(self): + """ + Collect all the rows and return a `pandas.DataFrame`. + + >>> df.to_pandas() # doctest: +SKIP + age name + 0 2 Alice + 1 5 Bob + """ + import pandas as pd + return pd.DataFrame.from_records(self.collect(), columns=self.columns) + # Having SchemaRDD for backward compatibility (for docs) class SchemaRDD(DataFrame): @@ -2551,6 +2563,19 @@ def cast(self, dataType): jc = self._jc.cast(jdt) return Column(jc, self.sql_ctx) + def to_pandas(self): + """ + Return a pandas.Series from the column + + >>> df.age.to_pandas() # doctest: +SKIP + 0 2 + 1 5 + dtype: int64 + """ + import pandas as pd + data = [c for c, in self.collect()] + return pd.Series(data) + def _aggregate_func(name, doc=""): """ Create a function for aggregator by name""" From dae216147f2247fd722fb0909da74fe71cf2fa8b Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 9 Feb 2015 11:45:12 -0800 Subject: [PATCH 067/357] [SPARK-5664][BUILD] Restore stty settings when exiting from SBT's spark-shell For launching spark-shell from SBT. Author: Liang-Chi Hsieh Closes #4451 from viirya/restore_stty and squashes the following commits: fdfc480 [Liang-Chi Hsieh] Restore stty settings when exit (for launching spark-shell from SBT). --- build/sbt | 28 ++++++++++++++++++++++++++++ build/sbt-launch-lib.bash | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/build/sbt b/build/sbt index 28ebb64f7197c..cc3203d79bccd 100755 --- a/build/sbt +++ b/build/sbt @@ -125,4 +125,32 @@ loadConfigFile() { [[ -f "$etc_sbt_opts_file" ]] && set -- $(loadConfigFile "$etc_sbt_opts_file") "$@" [[ -f "$sbt_opts_file" ]] && set -- $(loadConfigFile "$sbt_opts_file") "$@" +exit_status=127 +saved_stty="" + +restoreSttySettings() { + stty $saved_stty + saved_stty="" +} + +onExit() { + if [[ "$saved_stty" != "" ]]; then + restoreSttySettings + fi + exit $exit_status +} + +saveSttySettings() { + saved_stty=$(stty -g 2>/dev/null) + if [[ ! $? ]]; then + saved_stty="" + fi +} + +saveSttySettings +trap onExit INT + run "$@" + +exit_status=$? +onExit diff --git a/build/sbt-launch-lib.bash b/build/sbt-launch-lib.bash index 5e0c640fa5919..504be48b358fa 100755 --- a/build/sbt-launch-lib.bash +++ b/build/sbt-launch-lib.bash @@ -81,7 +81,7 @@ execRunner () { echo "" } - exec "$@" + "$@" } addJava () { From 6fe70d8432314f0b7290a66f114306f61e0a87cc Mon Sep 17 00:00:00 2001 From: mcheah Date: Mon, 9 Feb 2015 13:20:14 -0800 Subject: [PATCH 068/357] [SPARK-5691] Fixing wrong data structure lookup for dupe app registratio... In Master's registerApplication method, it checks if the application had already registered by examining the addressToWorker hash map. 
In reality, it should refer to the addressToApp data structure, as this is what really tracks which apps have been registered. Author: mcheah Closes #4477 from mccheah/spark-5691 and squashes the following commits: efdc573 [mcheah] [SPARK-5691] Fixing wrong data structure lookup for dupe app registration --- core/src/main/scala/org/apache/spark/deploy/master/Master.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index b8b1a25abff2e..53e453990f8c7 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -671,7 +671,7 @@ private[spark] class Master( def registerApplication(app: ApplicationInfo): Unit = { val appAddress = app.driver.path.address - if (addressToWorker.contains(appAddress)) { + if (addressToApp.contains(appAddress)) { logInfo("Attempted to re-register application at same address: " + appAddress) return } From 0765af9b21e9204c410c7a849c7201bc3eda8cc3 Mon Sep 17 00:00:00 2001 From: Hari Shreedharan Date: Mon, 9 Feb 2015 14:17:14 -0800 Subject: [PATCH 069/357] [SPARK-4905][STREAMING] FlumeStreamSuite fix. Using String constructor instead of CharsetDecoder to see if it fixes the issue of empty strings in Flume test output. Author: Hari Shreedharan Closes #4371 from harishreedharan/Flume-stream-attempted-fix and squashes the following commits: 550d363 [Hari Shreedharan] Fix imports. 8695950 [Hari Shreedharan] Use Charsets.UTF_8 instead of "UTF-8" in String constructors. af3ba14 [Hari Shreedharan] [SPARK-4905][STREAMING] FlumeStreamSuite fix. --- .../apache/spark/streaming/flume/FlumeStreamSuite.scala | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala index f333e3891b5f0..322de7bf2fed8 100644 --- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala +++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala @@ -19,13 +19,13 @@ package org.apache.spark.streaming.flume import java.net.{InetSocketAddress, ServerSocket} import java.nio.ByteBuffer -import java.nio.charset.Charset import scala.collection.JavaConversions._ import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} import scala.concurrent.duration._ import scala.language.postfixOps +import com.google.common.base.Charsets import org.apache.avro.ipc.NettyTransceiver import org.apache.avro.ipc.specific.SpecificRequestor import org.apache.flume.source.avro @@ -108,7 +108,7 @@ class FlumeStreamSuite extends FunSuite with BeforeAndAfter with Matchers with L val inputEvents = input.map { item => val event = new AvroFlumeEvent - event.setBody(ByteBuffer.wrap(item.getBytes("UTF-8"))) + event.setBody(ByteBuffer.wrap(item.getBytes(Charsets.UTF_8))) event.setHeaders(Map[CharSequence, CharSequence]("test" -> "header")) event } @@ -138,14 +138,13 @@ class FlumeStreamSuite extends FunSuite with BeforeAndAfter with Matchers with L status should be (avro.Status.OK) } - val decoder = Charset.forName("UTF-8").newDecoder() eventually(timeout(10 seconds), interval(100 milliseconds)) { val outputEvents = outputBuffer.flatten.map { _.event } outputEvents.foreach { event => event.getHeaders.get("test") should be("header") } - val output = 
outputEvents.map(event => decoder.decode(event.getBody()).toString) + val output = outputEvents.map(event => new String(event.getBody.array(), Charsets.UTF_8)) output should be (input) } } From f48199eb354d6ec8675c2c1f96c3005064058d66 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Mon, 9 Feb 2015 14:51:46 -0800 Subject: [PATCH 070/357] [SPARK-5675][SQL] XyzType companion object should subclass XyzType Otherwise, the following will always return false in Java. ```scala dataType instanceof StringType ``` Author: Reynold Xin Closes #4463 from rxin/type-companion-object and squashes the following commits: 04d5d8d [Reynold Xin] Comment. 976e11e [Reynold Xin] [SPARK-5675][SQL]StringType case object should be subclass of StringType class --- .../apache/spark/sql/types/dataTypes.scala | 85 ++++++++++++++++--- 1 file changed, 73 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala index 91efe320546a7..2abb1caee9cd9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala @@ -240,10 +240,16 @@ abstract class DataType { * @group dataType */ @DeveloperApi -case object NullType extends DataType { +class NullType private() extends DataType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "NullType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. override def defaultSize: Int = 1 } +case object NullType extends NullType + + protected[sql] object NativeType { val all = Seq( IntegerType, BooleanType, LongType, DoubleType, FloatType, ShortType, ByteType, StringType) @@ -292,7 +298,10 @@ protected[sql] abstract class NativeType extends DataType { * @group dataType */ @DeveloperApi -case object StringType extends NativeType with PrimitiveType { +class StringType private() extends NativeType with PrimitiveType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "StringType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. private[sql] type JvmType = String @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } private[sql] val ordering = implicitly[Ordering[JvmType]] @@ -303,6 +312,8 @@ case object StringType extends NativeType with PrimitiveType { override def defaultSize: Int = 4096 } +case object StringType extends StringType + /** * :: DeveloperApi :: @@ -313,7 +324,10 @@ case object StringType extends NativeType with PrimitiveType { * @group dataType */ @DeveloperApi -case object BinaryType extends NativeType with PrimitiveType { +class BinaryType private() extends NativeType with PrimitiveType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "BinaryType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. 
private[sql] type JvmType = Array[Byte] @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } private[sql] val ordering = new Ordering[JvmType] { @@ -332,6 +346,8 @@ case object BinaryType extends NativeType with PrimitiveType { override def defaultSize: Int = 4096 } +case object BinaryType extends BinaryType + /** * :: DeveloperApi :: @@ -341,7 +357,10 @@ case object BinaryType extends NativeType with PrimitiveType { *@group dataType */ @DeveloperApi -case object BooleanType extends NativeType with PrimitiveType { +class BooleanType private() extends NativeType with PrimitiveType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "BooleanType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. private[sql] type JvmType = Boolean @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } private[sql] val ordering = implicitly[Ordering[JvmType]] @@ -352,6 +371,8 @@ case object BooleanType extends NativeType with PrimitiveType { override def defaultSize: Int = 1 } +case object BooleanType extends BooleanType + /** * :: DeveloperApi :: @@ -362,7 +383,10 @@ case object BooleanType extends NativeType with PrimitiveType { * @group dataType */ @DeveloperApi -case object TimestampType extends NativeType { +class TimestampType private() extends NativeType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "TimestampType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. private[sql] type JvmType = Timestamp @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } @@ -377,6 +401,8 @@ case object TimestampType extends NativeType { override def defaultSize: Int = 12 } +case object TimestampType extends TimestampType + /** * :: DeveloperApi :: @@ -387,7 +413,10 @@ case object TimestampType extends NativeType { * @group dataType */ @DeveloperApi -case object DateType extends NativeType { +class DateType private() extends NativeType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "DateType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. private[sql] type JvmType = Int @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } @@ -400,6 +429,8 @@ case object DateType extends NativeType { override def defaultSize: Int = 4 } +case object DateType extends DateType + abstract class NumericType extends NativeType with PrimitiveType { // Unfortunately we can't get this implicitly as that breaks Spark Serialization. In order for @@ -438,7 +469,10 @@ protected[sql] sealed abstract class IntegralType extends NumericType { * @group dataType */ @DeveloperApi -case object LongType extends IntegralType { +class LongType private() extends IntegralType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "LongType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. 
private[sql] type JvmType = Long @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } private[sql] val numeric = implicitly[Numeric[Long]] @@ -453,6 +487,8 @@ case object LongType extends IntegralType { override def simpleString = "bigint" } +case object LongType extends LongType + /** * :: DeveloperApi :: @@ -462,7 +498,10 @@ case object LongType extends IntegralType { * @group dataType */ @DeveloperApi -case object IntegerType extends IntegralType { +class IntegerType private() extends IntegralType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "IntegerType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. private[sql] type JvmType = Int @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } private[sql] val numeric = implicitly[Numeric[Int]] @@ -477,6 +516,8 @@ case object IntegerType extends IntegralType { override def simpleString = "int" } +case object IntegerType extends IntegerType + /** * :: DeveloperApi :: @@ -486,7 +527,10 @@ case object IntegerType extends IntegralType { * @group dataType */ @DeveloperApi -case object ShortType extends IntegralType { +class ShortType private() extends IntegralType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "ShortType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. private[sql] type JvmType = Short @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } private[sql] val numeric = implicitly[Numeric[Short]] @@ -501,6 +545,8 @@ case object ShortType extends IntegralType { override def simpleString = "smallint" } +case object ShortType extends ShortType + /** * :: DeveloperApi :: @@ -510,7 +556,10 @@ case object ShortType extends IntegralType { * @group dataType */ @DeveloperApi -case object ByteType extends IntegralType { +class ByteType private() extends IntegralType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "ByteType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. private[sql] type JvmType = Byte @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } private[sql] val numeric = implicitly[Numeric[Byte]] @@ -525,6 +574,8 @@ case object ByteType extends IntegralType { override def simpleString = "tinyint" } +case object ByteType extends ByteType + /** Matcher for any expressions that evaluate to [[FractionalType]]s */ protected[sql] object FractionalType { @@ -630,7 +681,10 @@ object DecimalType { * @group dataType */ @DeveloperApi -case object DoubleType extends FractionalType { +class DoubleType private() extends FractionalType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "DoubleType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. 
private[sql] type JvmType = Double @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } private[sql] val numeric = implicitly[Numeric[Double]] @@ -644,6 +698,8 @@ case object DoubleType extends FractionalType { override def defaultSize: Int = 8 } +case object DoubleType extends DoubleType + /** * :: DeveloperApi :: @@ -653,7 +709,10 @@ case object DoubleType extends FractionalType { * @group dataType */ @DeveloperApi -case object FloatType extends FractionalType { +class FloatType private() extends FractionalType { + // The companion object and this class is separated so the companion object also subclasses + // this type. Otherwise, the companion object would be of type "FloatType$" in byte code. + // Defined with a private constructor so the companion object is the only possible instantiation. private[sql] type JvmType = Float @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } private[sql] val numeric = implicitly[Numeric[Float]] @@ -667,6 +726,8 @@ case object FloatType extends FractionalType { override def defaultSize: Int = 4 } +case object FloatType extends FloatType + object ArrayType { /** Construct a [[ArrayType]] object with the given element type. The `containsNull` is true. */ From b884daa58084d4f42e2318894067565b94e07f9d Mon Sep 17 00:00:00 2001 From: Florian Verhein Date: Mon, 9 Feb 2015 23:47:07 +0000 Subject: [PATCH 071/357] [SPARK-5611] [EC2] Allow spark-ec2 repo and branch to be set on CLI of spark_ec2.py and by extension, the ami-list Useful for using alternate spark-ec2 repos or branches. Author: Florian Verhein Closes #4385 from florianverhein/master and squashes the following commits: 7e2b4be [Florian Verhein] [SPARK-5611] [EC2] typo 8b653dc [Florian Verhein] [SPARK-5611] [EC2] Enforce only supporting spark-ec2 forks from github, log improvement bc4b0ed [Florian Verhein] [SPARK-5611] allow spark-ec2 repos with different names 8b5c551 [Florian Verhein] improve option naming, fix logging, fix lint failing, add guard to enforce spark-ec2 7724308 [Florian Verhein] [SPARK-5611] [EC2] fixes b42b68c [Florian Verhein] [SPARK-5611] [EC2] Allow spark-ec2 repo and branch to be set on CLI of spark_ec2.py --- ec2/spark_ec2.py | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 87b2112fe4628..3e4c49c0e1db6 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -62,10 +62,10 @@ DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark" -MESOS_SPARK_EC2_BRANCH = "branch-1.3" -# A URL prefix from which to fetch AMI information -AMI_PREFIX = "https://raw.github.com/mesos/spark-ec2/{b}/ami-list".format(b=MESOS_SPARK_EC2_BRANCH) +# Default location to get the spark-ec2 scripts (and ami-list) from +DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/mesos/spark-ec2" +DEFAULT_SPARK_EC2_BRANCH = "branch-1.3" def setup_boto(): @@ -147,6 +147,14 @@ def parse_args(): "--spark-git-repo", default=DEFAULT_SPARK_GITHUB_REPO, help="Github repo from which to checkout supplied commit hash (default: %default)") + parser.add_option( + "--spark-ec2-git-repo", + default=DEFAULT_SPARK_EC2_GITHUB_REPO, + help="Github repo from which to checkout spark-ec2 (default: %default)") + parser.add_option( + "--spark-ec2-git-branch", + default=DEFAULT_SPARK_EC2_BRANCH, + help="Github repo branch of spark-ec2 to use (default: %default)") parser.add_option( "--hadoop-major-version", default="1", 
help="Major version of Hadoop (default: %default)") @@ -333,7 +341,12 @@ def get_spark_ami(opts): print >> stderr,\ "Don't recognize %s, assuming type is pvm" % opts.instance_type - ami_path = "%s/%s/%s" % (AMI_PREFIX, opts.region, instance_type) + # URL prefix from which to fetch AMI information + ami_prefix = "{r}/{b}/ami-list".format( + r=opts.spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1), + b=opts.spark_ec2_git_branch) + + ami_path = "%s/%s/%s" % (ami_prefix, opts.region, instance_type) try: ami = urllib2.urlopen(ami_path).read().strip() print "Spark AMI: " + ami @@ -650,12 +663,15 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): # NOTE: We should clone the repository before running deploy_files to # prevent ec2-variables.sh from being overwritten + print "Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format( + r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch) ssh( host=master, opts=opts, command="rm -rf spark-ec2" + " && " - + "git clone https://github.com/mesos/spark-ec2.git -b {b}".format(b=MESOS_SPARK_EC2_BRANCH) + + "git clone {r} -b {b} spark-ec2".format(r=opts.spark_ec2_git_repo, + b=opts.spark_ec2_git_branch) ) print "Deploying files to master..." @@ -1038,6 +1054,17 @@ def real_main(): print >> stderr, "ebs-vol-num cannot be greater than 8" sys.exit(1) + # Prevent breaking ami_prefix (/, .git and startswith checks) + # Prevent forks with non spark-ec2 names for now. + if opts.spark_ec2_git_repo.endswith("/") or \ + opts.spark_ec2_git_repo.endswith(".git") or \ + not opts.spark_ec2_git_repo.startswith("https://github.com") or \ + not opts.spark_ec2_git_repo.endswith("spark-ec2"): + print >> stderr, "spark-ec2-git-repo must be a github repo and it must not have a " \ + "trailing / or .git. " \ + "Furthermore, we currently only support forks named spark-ec2." + sys.exit(1) + try: conn = ec2.connect_to_region(opts.region) except Exception as e: From 68b25cf695e0fce9e465288d5a053e540a3fccb4 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Mon, 9 Feb 2015 16:02:56 -0800 Subject: [PATCH 072/357] [SQL] Add some missing DataFrame functions. - as with a `Symbol` - distinct - sqlContext.emptyDataFrame - move add/remove col out of RDDApi section Author: Michael Armbrust Closes #4437 from marmbrus/dfMissingFuncs and squashes the following commits: 2004023 [Michael Armbrust] Add missing functions --- .../scala/org/apache/spark/sql/Column.scala | 9 +++++ .../org/apache/spark/sql/DataFrame.scala | 12 +++++-- .../org/apache/spark/sql/DataFrameImpl.scala | 34 +++++++++++-------- .../apache/spark/sql/IncomputableColumn.scala | 10 +++--- .../scala/org/apache/spark/sql/RDDApi.scala | 2 ++ .../org/apache/spark/sql/SQLContext.scala | 5 ++- 6 files changed, 51 insertions(+), 21 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 878b2b0556de7..1011bf0bb5ef4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -549,6 +549,15 @@ trait Column extends DataFrame { */ override def as(alias: String): Column = exprToColumn(Alias(expr, alias)()) + /** + * Gives the column an alias. + * {{{ + * // Renames colA to colB in select output. + * df.select($"colA".as('colB)) + * }}} + */ + override def as(alias: Symbol): Column = exprToColumn(Alias(expr, alias.name)()) + /** * Casts the column to a different data type. 
* {{{ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 17ea3cde8e50e..6abfb7853cf1c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -156,7 +156,7 @@ trait DataFrame extends RDDApi[Row] { def join(right: DataFrame, joinExprs: Column): DataFrame /** - * Join with another [[DataFrame]], usin g the given join expression. The following performs + * Join with another [[DataFrame]], using the given join expression. The following performs * a full outer join between `df1` and `df2`. * * {{{ @@ -233,7 +233,12 @@ trait DataFrame extends RDDApi[Row] { /** * Returns a new [[DataFrame]] with an alias set. */ - def as(name: String): DataFrame + def as(alias: String): DataFrame + + /** + * (Scala-specific) Returns a new [[DataFrame]] with an alias set. + */ + def as(alias: Symbol): DataFrame /** * Selects a set of expressions. @@ -516,6 +521,9 @@ trait DataFrame extends RDDApi[Row] { */ override def repartition(numPartitions: Int): DataFrame + /** Returns a new [[DataFrame]] that contains only the unique rows from this [[DataFrame]]. */ + override def distinct: DataFrame + override def persist(): this.type override def persist(newLevel: StorageLevel): this.type diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index fa05a5dcac6bf..73393295ab0a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -196,7 +196,9 @@ private[sql] class DataFrameImpl protected[sql]( }.toSeq :_*) } - override def as(name: String): DataFrame = Subquery(name, logicalPlan) + override def as(alias: String): DataFrame = Subquery(alias, logicalPlan) + + override def as(alias: Symbol): DataFrame = Subquery(alias.name, logicalPlan) override def select(cols: Column*): DataFrame = { val exprs = cols.zipWithIndex.map { @@ -215,7 +217,19 @@ private[sql] class DataFrameImpl protected[sql]( override def selectExpr(exprs: String*): DataFrame = { select(exprs.map { expr => Column(new SqlParser().parseExpression(expr)) - } :_*) + }: _*) + } + + override def addColumn(colName: String, col: Column): DataFrame = { + select(Column("*"), col.as(colName)) + } + + override def renameColumn(existingName: String, newName: String): DataFrame = { + val colNames = schema.map { field => + val name = field.name + if (name == existingName) Column(name).as(newName) else Column(name) + } + select(colNames :_*) } override def filter(condition: Column): DataFrame = { @@ -264,18 +278,8 @@ private[sql] class DataFrameImpl protected[sql]( } ///////////////////////////////////////////////////////////////////////////// - - override def addColumn(colName: String, col: Column): DataFrame = { - select(Column("*"), col.as(colName)) - } - - override def renameColumn(existingName: String, newName: String): DataFrame = { - val colNames = schema.map { field => - val name = field.name - if (name == existingName) Column(name).as(newName) else Column(name) - } - select(colNames :_*) - } + // RDD API + ///////////////////////////////////////////////////////////////////////////// override def head(n: Int): Array[Row] = limit(n).collect() @@ -307,6 +311,8 @@ private[sql] class DataFrameImpl protected[sql]( sqlContext.applySchema(rdd.repartition(numPartitions), schema) } + override def distinct: DataFrame 
= Distinct(logicalPlan) + override def persist(): this.type = { sqlContext.cacheManager.cacheQuery(this) this diff --git a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala index 782f6e28eebb0..0600dcc226b4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala @@ -86,6 +86,10 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten override def selectExpr(exprs: String*): DataFrame = err() + override def addColumn(colName: String, col: Column): DataFrame = err() + + override def renameColumn(existingName: String, newName: String): DataFrame = err() + override def filter(condition: Column): DataFrame = err() override def filter(conditionExpr: String): DataFrame = err() @@ -110,10 +114,6 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten ///////////////////////////////////////////////////////////////////////////// - override def addColumn(colName: String, col: Column): DataFrame = err() - - override def renameColumn(existingName: String, newName: String): DataFrame = err() - override def head(n: Int): Array[Row] = err() override def head(): Row = err() @@ -140,6 +140,8 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten override def repartition(numPartitions: Int): DataFrame = err() + override def distinct: DataFrame = err() + override def persist(): this.type = err() override def persist(newLevel: StorageLevel): this.type = err() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RDDApi.scala b/sql/core/src/main/scala/org/apache/spark/sql/RDDApi.scala index 38e6382f171d5..df866fd1ad8ad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RDDApi.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RDDApi.scala @@ -60,4 +60,6 @@ private[sql] trait RDDApi[T] { def first(): T def repartition(numPartitions: Int): DataFrame + + def distinct: DataFrame } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index bf3990671029e..97e3777f933e4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.optimizer.{DefaultOptimizer, Optimizer} -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, NoRelation} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.execution._ import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation} @@ -130,6 +130,9 @@ class SQLContext(@transient val sparkContext: SparkContext) */ val experimental: ExperimentalMethods = new ExperimentalMethods(this) + /** Returns a [[DataFrame]] with no rows or columns. */ + lazy val emptyDataFrame = DataFrame(this, NoRelation) + /** * A collection of methods for registering user-defined functions (UDF). * From 5f0b30e59cc6a3017168189d3aaf09402699dc3b Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 9 Feb 2015 16:20:42 -0800 Subject: [PATCH 073/357] [SQL] Code cleanup. 
I added an unnecessary line of code in https://github.com/apache/spark/commit/13531dd97c08563e53dacdaeaf1102bdd13ef825. My bad. Let's delete it. Author: Yin Huai Closes #4482 from yhuai/unnecessaryCode and squashes the following commits: 3645af0 [Yin Huai] Code cleanup. --- .../org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala | 3 --- 1 file changed, 3 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index c23575fe96898..036efa84d7c85 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -351,9 +351,6 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { |) """.stripMargin) - new Path("/Users/yhuai/Desktop/whatever") - - val expectedPath = catalog.hiveDefaultTableFilePath("ctasJsonTable") val filesystemPath = new Path(expectedPath) val fs = filesystemPath.getFileSystem(sparkContext.hadoopConfiguration) From b8080aa86d55e0467fd4328f10a2f0d6605e6cc6 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 9 Feb 2015 16:23:12 -0800 Subject: [PATCH 074/357] [SPARK-5696] [SQL] [HOTFIX] Asks HiveThriftServer2 to re-initialize log4j using Hive configurations In this way, log4j configurations overriden by jets3t-0.9.2.jar can be again overriden by Hive default log4j configurations. This might not be the best solution for this issue since it requires users to use `hive-log4j.properties` rather than `log4j.properties` to initialize `HiveThriftServer2` logging configurations, which can be confusing. The main purpose of this PR is to fix Jenkins PR build. [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4484) Author: Cheng Lian Closes #4484 from liancheng/spark-5696 and squashes the following commits: df83956 [Cheng Lian] Hot fix: asks HiveThriftServer2 to re-initialize log4j using Hive configurations --- .../apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index 6e07df18b0e15..525777aa454c4 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive.thriftserver import org.apache.commons.logging.LogFactory +import org.apache.hadoop.hive.common.LogUtils import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.thrift.{ThriftBinaryCLIService, ThriftHttpCLIService} @@ -54,6 +55,8 @@ object HiveThriftServer2 extends Logging { System.exit(-1) } + LogUtils.initHiveLog4j() + logInfo("Starting SparkContext") SparkSQLEnv.init() From 2a36292534a1e9f7a501e88f69bfc3a09fb62cb3 Mon Sep 17 00:00:00 2001 From: Lu Yan Date: Mon, 9 Feb 2015 16:25:38 -0800 Subject: [PATCH 075/357] [SPARK-5614][SQL] Predicate pushdown through Generate. Now in Catalyst's rules, predicates can not be pushed through "Generate" nodes. Further more, partition pruning in HiveTableScan can not be applied on those queries involves "Generate". This makes such queries very inefficient. 
In practice, it finds patterns like ```scala Filter(predicate, Generate(generator, _, _, _, grandChild)) ``` and splits the predicate into 2 parts by referencing the generated column from Generate node or not. And a new Filter will be created for those conjuncts can be pushed beneath Generate node. If nothing left for the original Filter, it will be removed. For example, physical plan for query ```sql select len, bk from s_server lateral view explode(len_arr) len_table as len where len > 5 and day = '20150102'; ``` where 'day' is a partition column in metastore is like this in current version of Spark SQL: > Project [len, bk] > > Filter ((len > "5") && "(day = "20150102")") > > Generate explode(len_arr), true, false > > HiveTableScan [bk, len_arr, day], (MetastoreRelation default, s_server, None), None But theoretically the plan should be like this > Project [len, bk] > > Filter (len > "5") > > Generate explode(len_arr), true, false > > HiveTableScan [bk, len_arr, day], (MetastoreRelation default, s_server, None), Some(day = "20150102") Where partition pruning predicates can be pushed to HiveTableScan nodes. Author: Lu Yan Closes #4394 from ianluyan/ppd and squashes the following commits: a67dce9 [Lu Yan] Fix English grammar. 7cea911 [Lu Yan] Revised based on @marmbrus's opinions ffc59fc [Lu Yan] [SPARK-5614][SQL] Predicate pushdown through Generate. --- .../sql/catalyst/optimizer/Optimizer.scala | 25 ++++++++ .../optimizer/FilterPushdownSuite.scala | 63 ++++++++++++++++++- 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 3bc48c95c5653..fd58b9681ea24 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -50,6 +50,7 @@ object DefaultOptimizer extends Optimizer { CombineFilters, PushPredicateThroughProject, PushPredicateThroughJoin, + PushPredicateThroughGenerate, ColumnPruning) :: Batch("LocalRelation", FixedPoint(100), ConvertToLocalRelation) :: Nil @@ -455,6 +456,30 @@ object PushPredicateThroughProject extends Rule[LogicalPlan] { } } +/** + * Push [[Filter]] operators through [[Generate]] operators. Parts of the predicate that reference + * attributes generated in [[Generate]] will remain above, and the rest should be pushed beneath. + */ +object PushPredicateThroughGenerate extends Rule[LogicalPlan] with PredicateHelper { + + def apply(plan: LogicalPlan): LogicalPlan = plan transform { + case filter @ Filter(condition, + generate @ Generate(generator, join, outer, alias, grandChild)) => + // Predicates that reference attributes produced by the `Generate` operator cannot + // be pushed below the operator. + val (pushDown, stayUp) = splitConjunctivePredicates(condition).partition { + conjunct => conjunct.references subsetOf grandChild.outputSet + } + if (pushDown.nonEmpty) { + val pushDownPredicate = pushDown.reduce(And) + val withPushdown = generate.copy(child = Filter(pushDownPredicate, grandChild)) + stayUp.reduceOption(And).map(Filter(_, withPushdown)).getOrElse(withPushdown) + } else { + filter + } + } +} + /** * Pushes down [[Filter]] operators where the `condition` can be * evaluated using only the attributes of the left or right side of a join. 
Other diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index ebb123c1f909e..1158b5dfc6147 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -19,11 +19,13 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.analysis.EliminateAnalysisOperators +import org.apache.spark.sql.catalyst.expressions.Explode import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.{PlanTest, LeftOuter, RightOuter} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.types.IntegerType class FilterPushdownSuite extends PlanTest { @@ -34,7 +36,8 @@ class FilterPushdownSuite extends PlanTest { Batch("Filter Pushdown", Once, CombineFilters, PushPredicateThroughProject, - PushPredicateThroughJoin) :: Nil + PushPredicateThroughJoin, + PushPredicateThroughGenerate) :: Nil } val testRelation = LocalRelation('a.int, 'b.int, 'c.int) @@ -411,4 +414,62 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, analysis.EliminateAnalysisOperators(correctAnswer)) } + + val testRelationWithArrayType = LocalRelation('a.int, 'b.int, 'c_arr.array(IntegerType)) + + test("generate: predicate referenced no generated column") { + val originalQuery = { + testRelationWithArrayType + .generate(Explode(Seq("c"), 'c_arr), true, false, Some("arr")) + .where(('b >= 5) && ('a > 6)) + } + val optimized = Optimize(originalQuery.analyze) + val correctAnswer = { + testRelationWithArrayType + .where(('b >= 5) && ('a > 6)) + .generate(Explode(Seq("c"), 'c_arr), true, false, Some("arr")).analyze + } + + comparePlans(optimized, correctAnswer) + } + + test("generate: part of conjuncts referenced generated column") { + val generator = Explode(Seq("c"), 'c_arr) + val originalQuery = { + testRelationWithArrayType + .generate(generator, true, false, Some("arr")) + .where(('b >= 5) && ('c > 6)) + } + val optimized = Optimize(originalQuery.analyze) + val referenceResult = { + testRelationWithArrayType + .where('b >= 5) + .generate(generator, true, false, Some("arr")) + .where('c > 6).analyze + } + + // Since newly generated columns get different ids every time being analyzed + // e.g. comparePlans(originalQuery.analyze, originalQuery.analyze) fails. + // So we check operators manually here. 
+ // Filter("c" > 6) + assertResult(classOf[Filter])(optimized.getClass) + assertResult(1)(optimized.asInstanceOf[Filter].condition.references.size) + assertResult("c"){ + optimized.asInstanceOf[Filter].condition.references.toSeq(0).name + } + + // the rest part + comparePlans(optimized.children(0), referenceResult.children(0)) + } + + test("generate: all conjuncts referenced generated column") { + val originalQuery = { + testRelationWithArrayType + .generate(Explode(Seq("c"), 'c_arr), true, false, Some("arr")) + .where(('c > 6) || ('b > 5)).analyze + } + val optimized = Optimize(originalQuery) + + comparePlans(optimized, originalQuery) + } } From 0ee53ebce9944722e76b2b28fae79d9956be9f17 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 9 Feb 2015 16:39:34 -0800 Subject: [PATCH 076/357] [SPARK-2096][SQL] support dot notation on array of struct ~~The rule is simple: If you want `a.b` work, then `a` must be some level of nested array of struct(level 0 means just a StructType). And the result of `a.b` is same level of nested array of b-type. An optimization is: the resolve chain looks like `Attribute -> GetItem -> GetField -> GetField ...`, so we could transmit the nested array information between `GetItem` and `GetField` to avoid repeated computation of `innerDataType` and `containsNullList` of that nested array.~~ marmbrus Could you take a look? to evaluate `a.b`, if `a` is array of struct, then `a.b` means get field `b` on each element of `a`, and return a result of array. Author: Wenchen Fan Closes #2405 from cloud-fan/nested-array-dot and squashes the following commits: 08a228a [Wenchen Fan] support dot notation on array of struct --- .../sql/catalyst/analysis/Analyzer.scala | 30 +++++++++------- .../catalyst/expressions/complexTypes.scala | 34 ++++++++++++++++--- .../sql/catalyst/optimizer/Optimizer.scala | 3 +- .../ExpressionEvaluationSuite.scala | 2 +- .../org/apache/spark/sql/json/JsonSuite.scala | 6 ++-- 5 files changed, 53 insertions(+), 22 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 0b59ed1739566..fb2ff014cef07 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -22,8 +22,7 @@ import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.types.IntegerType +import org.apache.spark.sql.types.{ArrayType, StructField, StructType, IntegerType} /** * A trivial [[Analyzer]] with an [[EmptyCatalog]] and [[EmptyFunctionRegistry]]. Used for testing @@ -311,18 +310,25 @@ class Analyzer(catalog: Catalog, * desired fields are found. 
*/ protected def resolveGetField(expr: Expression, fieldName: String): Expression = { + def findField(fields: Array[StructField]): Int = { + val checkField = (f: StructField) => resolver(f.name, fieldName) + val ordinal = fields.indexWhere(checkField) + if (ordinal == -1) { + sys.error( + s"No such struct field $fieldName in ${fields.map(_.name).mkString(", ")}") + } else if (fields.indexWhere(checkField, ordinal + 1) != -1) { + sys.error(s"Ambiguous reference to fields ${fields.filter(checkField).mkString(", ")}") + } else { + ordinal + } + } expr.dataType match { case StructType(fields) => - val actualField = fields.filter(f => resolver(f.name, fieldName)) - if (actualField.length == 0) { - sys.error( - s"No such struct field $fieldName in ${fields.map(_.name).mkString(", ")}") - } else if (actualField.length == 1) { - val field = actualField(0) - GetField(expr, field, fields.indexOf(field)) - } else { - sys.error(s"Ambiguous reference to fields ${actualField.mkString(", ")}") - } + val ordinal = findField(fields) + StructGetField(expr, fields(ordinal), ordinal) + case ArrayType(StructType(fields), containsNull) => + val ordinal = findField(fields) + ArrayGetField(expr, fields(ordinal), ordinal, containsNull) case otherType => sys.error(s"GetField is not valid on fields of type $otherType") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala index 66e2e5c4bafce..68051a2a2007e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypes.scala @@ -70,22 +70,48 @@ case class GetItem(child: Expression, ordinal: Expression) extends Expression { } } + +trait GetField extends UnaryExpression { + self: Product => + + type EvaluatedType = Any + override def foldable = child.foldable + override def toString = s"$child.${field.name}" + + def field: StructField +} + /** * Returns the value of fields in the Struct `child`. */ -case class GetField(child: Expression, field: StructField, ordinal: Int) extends UnaryExpression { - type EvaluatedType = Any +case class StructGetField(child: Expression, field: StructField, ordinal: Int) extends GetField { def dataType = field.dataType override def nullable = child.nullable || field.nullable - override def foldable = child.foldable override def eval(input: Row): Any = { val baseValue = child.eval(input).asInstanceOf[Row] if (baseValue == null) null else baseValue(ordinal) } +} - override def toString = s"$child.${field.name}" +/** + * Returns the array of value of fields in the Array of Struct `child`. 
+ */ +case class ArrayGetField(child: Expression, field: StructField, ordinal: Int, containsNull: Boolean) + extends GetField { + + def dataType = ArrayType(field.dataType, containsNull) + override def nullable = child.nullable + + override def eval(input: Row): Any = { + val baseValue = child.eval(input).asInstanceOf[Seq[Row]] + if (baseValue == null) null else { + baseValue.map { row => + if (row == null) null else row(ordinal) + } + } + } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index fd58b9681ea24..0da081ed1a6e2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -209,7 +209,8 @@ object NullPropagation extends Rule[LogicalPlan] { case e @ IsNotNull(c) if !c.nullable => Literal(true, BooleanType) case e @ GetItem(Literal(null, _), _) => Literal(null, e.dataType) case e @ GetItem(_, Literal(null, _)) => Literal(null, e.dataType) - case e @ GetField(Literal(null, _), _, _) => Literal(null, e.dataType) + case e @ StructGetField(Literal(null, _), _, _) => Literal(null, e.dataType) + case e @ ArrayGetField(Literal(null, _), _, _, _) => Literal(null, e.dataType) case e @ EqualNullSafe(Literal(null, _), r) => IsNull(r) case e @ EqualNullSafe(l, Literal(null, _)) => IsNull(l) case e @ Count(expr) if !expr.nullable => Count(Literal(1)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index 7cf6c80194f6c..dcfd8b28cb02a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -851,7 +851,7 @@ class ExpressionEvaluationSuite extends FunSuite { expr.dataType match { case StructType(fields) => val field = fields.find(_.name == fieldName).get - GetField(expr, field, fields.indexOf(field)) + StructGetField(expr, field, fields.indexOf(field)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 926ba68828ee8..7870cf9b0a868 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -342,21 +342,19 @@ class JsonSuite extends QueryTest { ) } - ignore("Complex field and type inferring (Ignored)") { + test("GetField operation on complex data type") { val jsonDF = jsonRDD(complexFieldAndType1) jsonDF.registerTempTable("jsonTable") - // Right now, "field1" and "field2" are treated as aliases. We should fix it. checkAnswer( sql("select arrayOfStruct[0].field1, arrayOfStruct[0].field2 from jsonTable"), Row(true, "str1") ) - // Right now, the analyzer cannot resolve arrayOfStruct.field1 and arrayOfStruct.field2. // Getting all values of a specific field from an array of structs. 
checkAnswer( sql("select arrayOfStruct.field1, arrayOfStruct.field2 from jsonTable"), - Row(Seq(true, false), Seq("str1", null)) + Row(Seq(true, false, null), Seq("str1", null, null)) ) } From d08e7c2b498584609cb3c7922eaaa2a0d115603f Mon Sep 17 00:00:00 2001 From: DoingDone9 <799203320@qq.com> Date: Mon, 9 Feb 2015 16:40:26 -0800 Subject: [PATCH 077/357] [SPARK-5648][SQL] support "alter ... unset tblproperties("key")" make hivecontext support "alter ... unset tblproperties("key")" like : alter view viewName unset tblproperties("k") alter table tableName unset tblproperties("k") Author: DoingDone9 <799203320@qq.com> Closes #4424 from DoingDone9/unset and squashes the following commits: 6dd8bee [DoingDone9] support "alter ... unset tblproperties("key")" --- sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 2a4b88092179f..f51af62d3340b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -103,6 +103,7 @@ private[hive] object HiveQl { "TOK_CREATEINDEX", "TOK_DROPDATABASE", "TOK_DROPINDEX", + "TOK_DROPTABLE_PROPERTIES", "TOK_MSCK", "TOK_ALTERVIEW_ADDPARTS", @@ -111,6 +112,7 @@ private[hive] object HiveQl { "TOK_ALTERVIEW_PROPERTIES", "TOK_ALTERVIEW_RENAME", "TOK_CREATEVIEW", + "TOK_DROPVIEW_PROPERTIES", "TOK_DROPVIEW", "TOK_EXPORT", From 3ec3ad295ddd1435da68251b7479ffb60aec7037 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 9 Feb 2015 16:52:05 -0800 Subject: [PATCH 078/357] [SPARK-5699] [SQL] [Tests] Runs hive-thriftserver tests whenever SQL code is modified [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4486) Author: Cheng Lian Closes #4486 from liancheng/spark-5699 and squashes the following commits: 538001d [Cheng Lian] Runs hive-thriftserver tests whenever SQL code is modified --- dev/run-tests | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index 2257a566bb1bb..483958757a2dd 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -36,7 +36,7 @@ function handle_error () { } -# Build against the right verison of Hadoop. +# Build against the right version of Hadoop. { if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then @@ -77,7 +77,7 @@ export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl" fi } -# Only run Hive tests if there are sql changes. +# Only run Hive tests if there are SQL changes. # Partial solution for SPARK-1455. if [ -n "$AMPLAB_JENKINS" ]; then git fetch origin master:master @@ -183,7 +183,7 @@ CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS if [ -n "$_SQL_TESTS_ONLY" ]; then # This must be an array of individual arguments. Otherwise, having one long string # will be interpreted as a single test, which doesn't work. - SBT_MAVEN_TEST_ARGS=("catalyst/test" "sql/test" "hive/test" "mllib/test") + SBT_MAVEN_TEST_ARGS=("catalyst/test" "sql/test" "hive/test" "hive-thriftserver/test" "mllib/test") else SBT_MAVEN_TEST_ARGS=("test") fi From d302c4800bf2f74eceb731169ddf1766136b7398 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 9 Feb 2015 17:33:29 -0800 Subject: [PATCH 079/357] [SPARK-5698] Do not let user request negative # of executors Otherwise we might crash the ApplicationMaster. Why? Please see https://issues.apache.org/jira/browse/SPARK-5698. 
sryza I believe this is also relevant in your patch #4168. Author: Andrew Or Closes #4483 from andrewor14/da-negative and squashes the following commits: 53ed955 [Andrew Or] Throw IllegalArgumentException instead 0e89fd5 [Andrew Or] Check against negative requests --- .../scheduler/cluster/CoarseGrainedSchedulerBackend.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 9d2fb4f3b4729..f9ca93432bf41 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -314,6 +314,11 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste * Return whether the request is acknowledged. */ final override def requestExecutors(numAdditionalExecutors: Int): Boolean = synchronized { + if (numAdditionalExecutors < 0) { + throw new IllegalArgumentException( + "Attempted to request a negative number of additional executor(s) " + + s"$numAdditionalExecutors from the cluster manager. Please specify a positive number!") + } logInfo(s"Requesting $numAdditionalExecutors additional executor(s) from the cluster manager") logDebug(s"Number of pending executors is now $numPendingExecutors") numPendingExecutors += numAdditionalExecutors From 08488c175f2e8532cb6aab84da2abd9ad57179cc Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 9 Feb 2015 20:49:22 -0800 Subject: [PATCH 080/357] [SPARK-5469] restructure pyspark.sql into multiple files All the DataTypes moved into pyspark.sql.types The changes can be tracked by `--find-copies-harder -M25` ``` davieslocalhost:~/work/spark/python$ git diff --find-copies-harder -M25 --numstat master.. 2 5 python/docs/pyspark.ml.rst 0 3 python/docs/pyspark.mllib.rst 10 2 python/docs/pyspark.sql.rst 1 1 python/pyspark/mllib/linalg.py 21 14 python/pyspark/{mllib => sql}/__init__.py 14 2108 python/pyspark/{sql.py => sql/context.py} 10 1772 python/pyspark/{sql.py => sql/dataframe.py} 7 6 python/pyspark/{sql_tests.py => sql/tests.py} 8 1465 python/pyspark/{sql.py => sql/types.py} 4 2 python/run-tests 1 1 sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala ``` Also `git blame -C -C python/pyspark/sql/context.py` to track the history. 
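For reference, a minimal usage sketch (not part of this patch; the schema below is a made-up example) of where the data type classes live after the split — they are now imported from `pyspark.sql.types` rather than `pyspark.sql`, as the `mllib/linalg.py` hunk below also shows:

```python
# Hypothetical example: build and inspect a schema using the relocated types.
# After this patch the DataType classes come from pyspark.sql.types;
# the schema API itself is unchanged by the move, only the import path differs.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("field1", IntegerType(), False),
    StructField("field2", StringType(), False),
])

print(schema.json())
```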
Author: Davies Liu Closes #4479 from davies/sql and squashes the following commits: 1b5f0a5 [Davies Liu] Merge branch 'master' of github.com:apache/spark into sql 2b2b983 [Davies Liu] restructure pyspark.sql --- python/docs/pyspark.ml.rst | 7 +- python/docs/pyspark.mllib.rst | 3 - python/docs/pyspark.sql.rst | 12 +- python/pyspark/mllib/linalg.py | 2 +- python/pyspark/sql.py | 2736 ----------------- python/pyspark/sql/__init__.py | 42 + python/pyspark/sql/context.py | 642 ++++ python/pyspark/sql/dataframe.py | 974 ++++++ python/pyspark/{sql_tests.py => sql/tests.py} | 13 +- python/pyspark/sql/types.py | 1279 ++++++++ python/run-tests | 6 +- .../spark/sql/test/ExamplePointUDT.scala | 2 +- 12 files changed, 2962 insertions(+), 2756 deletions(-) delete mode 100644 python/pyspark/sql.py create mode 100644 python/pyspark/sql/__init__.py create mode 100644 python/pyspark/sql/context.py create mode 100644 python/pyspark/sql/dataframe.py rename python/pyspark/{sql_tests.py => sql/tests.py} (96%) create mode 100644 python/pyspark/sql/types.py diff --git a/python/docs/pyspark.ml.rst b/python/docs/pyspark.ml.rst index f10d1339a9a8f..4da6d4a74a299 100644 --- a/python/docs/pyspark.ml.rst +++ b/python/docs/pyspark.ml.rst @@ -1,11 +1,8 @@ pyspark.ml package ===================== -Submodules ----------- - -pyspark.ml module ------------------ +Module Context +-------------- .. automodule:: pyspark.ml :members: diff --git a/python/docs/pyspark.mllib.rst b/python/docs/pyspark.mllib.rst index 4548b8739ed91..21f66ca344a3c 100644 --- a/python/docs/pyspark.mllib.rst +++ b/python/docs/pyspark.mllib.rst @@ -1,9 +1,6 @@ pyspark.mllib package ===================== -Submodules ----------- - pyspark.mllib.classification module ----------------------------------- diff --git a/python/docs/pyspark.sql.rst b/python/docs/pyspark.sql.rst index 65b3650ae10ab..80c6f02a9df41 100644 --- a/python/docs/pyspark.sql.rst +++ b/python/docs/pyspark.sql.rst @@ -1,10 +1,18 @@ pyspark.sql module ================== -Module contents ---------------- +Module Context +-------------- .. automodule:: pyspark.sql :members: :undoc-members: :show-inheritance: + + +pyspark.sql.types module +------------------------ +.. automodule:: pyspark.sql.types + :members: + :undoc-members: + :show-inheritance: diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 7f21190ed8c25..597012b1c967c 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -29,7 +29,7 @@ import numpy as np -from pyspark.sql import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ +from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ IntegerType, ByteType diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py deleted file mode 100644 index 6a6dfbc5851b8..0000000000000 --- a/python/pyspark/sql.py +++ /dev/null @@ -1,2736 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -public classes of Spark SQL: - - - L{SQLContext} - Main entry point for SQL functionality. - - L{DataFrame} - A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In - addition to normal RDD operations, DataFrames also support SQL. - - L{GroupedData} - - L{Column} - Column is a DataFrame with a single column. - - L{Row} - A Row of data returned by a Spark SQL query. - - L{HiveContext} - Main entry point for accessing data stored in Apache Hive.. -""" - -import sys -import itertools -import decimal -import datetime -import keyword -import warnings -import json -import re -import random -import os -from tempfile import NamedTemporaryFile -from array import array -from operator import itemgetter -from itertools import imap - -from py4j.protocol import Py4JError -from py4j.java_collections import ListConverter, MapConverter - -from pyspark.context import SparkContext -from pyspark.rdd import RDD, _prepare_for_python_RDD -from pyspark.serializers import BatchedSerializer, AutoBatchedSerializer, PickleSerializer, \ - CloudPickleSerializer, UTF8Deserializer -from pyspark.storagelevel import StorageLevel -from pyspark.traceback_utils import SCCallSiteSync - - -__all__ = [ - "StringType", "BinaryType", "BooleanType", "DateType", "TimestampType", "DecimalType", - "DoubleType", "FloatType", "ByteType", "IntegerType", "LongType", - "ShortType", "ArrayType", "MapType", "StructField", "StructType", - "SQLContext", "HiveContext", "DataFrame", "GroupedData", "Column", "Row", "Dsl", - "SchemaRDD"] - - -class DataType(object): - - """Spark SQL DataType""" - - def __repr__(self): - return self.__class__.__name__ - - def __hash__(self): - return hash(str(self)) - - def __eq__(self, other): - return (isinstance(other, self.__class__) and - self.__dict__ == other.__dict__) - - def __ne__(self, other): - return not self.__eq__(other) - - @classmethod - def typeName(cls): - return cls.__name__[:-4].lower() - - def jsonValue(self): - return self.typeName() - - def json(self): - return json.dumps(self.jsonValue(), - separators=(',', ':'), - sort_keys=True) - - -class PrimitiveTypeSingleton(type): - - """Metaclass for PrimitiveType""" - - _instances = {} - - def __call__(cls): - if cls not in cls._instances: - cls._instances[cls] = super(PrimitiveTypeSingleton, cls).__call__() - return cls._instances[cls] - - -class PrimitiveType(DataType): - - """Spark SQL PrimitiveType""" - - __metaclass__ = PrimitiveTypeSingleton - - def __eq__(self, other): - # because they should be the same object - return self is other - - -class NullType(PrimitiveType): - - """Spark SQL NullType - - The data type representing None, used for the types which has not - been inferred. - """ - - -class StringType(PrimitiveType): - - """Spark SQL StringType - - The data type representing string values. - """ - - -class BinaryType(PrimitiveType): - - """Spark SQL BinaryType - - The data type representing bytearray values. - """ - - -class BooleanType(PrimitiveType): - - """Spark SQL BooleanType - - The data type representing bool values. 
- """ - - -class DateType(PrimitiveType): - - """Spark SQL DateType - - The data type representing datetime.date values. - """ - - -class TimestampType(PrimitiveType): - - """Spark SQL TimestampType - - The data type representing datetime.datetime values. - """ - - -class DecimalType(DataType): - - """Spark SQL DecimalType - - The data type representing decimal.Decimal values. - """ - - def __init__(self, precision=None, scale=None): - self.precision = precision - self.scale = scale - self.hasPrecisionInfo = precision is not None - - def jsonValue(self): - if self.hasPrecisionInfo: - return "decimal(%d,%d)" % (self.precision, self.scale) - else: - return "decimal" - - def __repr__(self): - if self.hasPrecisionInfo: - return "DecimalType(%d,%d)" % (self.precision, self.scale) - else: - return "DecimalType()" - - -class DoubleType(PrimitiveType): - - """Spark SQL DoubleType - - The data type representing float values. - """ - - -class FloatType(PrimitiveType): - - """Spark SQL FloatType - - The data type representing single precision floating-point values. - """ - - -class ByteType(PrimitiveType): - - """Spark SQL ByteType - - The data type representing int values with 1 singed byte. - """ - - -class IntegerType(PrimitiveType): - - """Spark SQL IntegerType - - The data type representing int values. - """ - - -class LongType(PrimitiveType): - - """Spark SQL LongType - - The data type representing long values. If the any value is - beyond the range of [-9223372036854775808, 9223372036854775807], - please use DecimalType. - """ - - -class ShortType(PrimitiveType): - - """Spark SQL ShortType - - The data type representing int values with 2 signed bytes. - """ - - -class ArrayType(DataType): - - """Spark SQL ArrayType - - The data type representing list values. An ArrayType object - comprises two fields, elementType (a DataType) and containsNull (a bool). - The field of elementType is used to specify the type of array elements. - The field of containsNull is used to specify if the array has None values. - - """ - - def __init__(self, elementType, containsNull=True): - """Creates an ArrayType - - :param elementType: the data type of elements. - :param containsNull: indicates whether the list contains None values. - - >>> ArrayType(StringType) == ArrayType(StringType, True) - True - >>> ArrayType(StringType, False) == ArrayType(StringType) - False - """ - self.elementType = elementType - self.containsNull = containsNull - - def __repr__(self): - return "ArrayType(%s,%s)" % (self.elementType, - str(self.containsNull).lower()) - - def jsonValue(self): - return {"type": self.typeName(), - "elementType": self.elementType.jsonValue(), - "containsNull": self.containsNull} - - @classmethod - def fromJson(cls, json): - return ArrayType(_parse_datatype_json_value(json["elementType"]), - json["containsNull"]) - - -class MapType(DataType): - - """Spark SQL MapType - - The data type representing dict values. A MapType object comprises - three fields, keyType (a DataType), valueType (a DataType) and - valueContainsNull (a bool). - - The field of keyType is used to specify the type of keys in the map. - The field of valueType is used to specify the type of values in the map. - The field of valueContainsNull is used to specify if values of this - map has None values. - - For values of a MapType column, keys are not allowed to have None values. - - """ - - def __init__(self, keyType, valueType, valueContainsNull=True): - """Creates a MapType - :param keyType: the data type of keys. 
- :param valueType: the data type of values. - :param valueContainsNull: indicates whether values contains - null values. - - >>> (MapType(StringType, IntegerType) - ... == MapType(StringType, IntegerType, True)) - True - >>> (MapType(StringType, IntegerType, False) - ... == MapType(StringType, FloatType)) - False - """ - self.keyType = keyType - self.valueType = valueType - self.valueContainsNull = valueContainsNull - - def __repr__(self): - return "MapType(%s,%s,%s)" % (self.keyType, self.valueType, - str(self.valueContainsNull).lower()) - - def jsonValue(self): - return {"type": self.typeName(), - "keyType": self.keyType.jsonValue(), - "valueType": self.valueType.jsonValue(), - "valueContainsNull": self.valueContainsNull} - - @classmethod - def fromJson(cls, json): - return MapType(_parse_datatype_json_value(json["keyType"]), - _parse_datatype_json_value(json["valueType"]), - json["valueContainsNull"]) - - -class StructField(DataType): - - """Spark SQL StructField - - Represents a field in a StructType. - A StructField object comprises three fields, name (a string), - dataType (a DataType) and nullable (a bool). The field of name - is the name of a StructField. The field of dataType specifies - the data type of a StructField. - - The field of nullable specifies if values of a StructField can - contain None values. - - """ - - def __init__(self, name, dataType, nullable=True, metadata=None): - """Creates a StructField - :param name: the name of this field. - :param dataType: the data type of this field. - :param nullable: indicates whether values of this field - can be null. - :param metadata: metadata of this field, which is a map from string - to simple type that can be serialized to JSON - automatically - - >>> (StructField("f1", StringType, True) - ... == StructField("f1", StringType, True)) - True - >>> (StructField("f1", StringType, True) - ... == StructField("f2", StringType, True)) - False - """ - self.name = name - self.dataType = dataType - self.nullable = nullable - self.metadata = metadata or {} - - def __repr__(self): - return "StructField(%s,%s,%s)" % (self.name, self.dataType, - str(self.nullable).lower()) - - def jsonValue(self): - return {"name": self.name, - "type": self.dataType.jsonValue(), - "nullable": self.nullable, - "metadata": self.metadata} - - @classmethod - def fromJson(cls, json): - return StructField(json["name"], - _parse_datatype_json_value(json["type"]), - json["nullable"], - json["metadata"]) - - -class StructType(DataType): - - """Spark SQL StructType - - The data type representing rows. - A StructType object comprises a list of L{StructField}. - - """ - - def __init__(self, fields): - """Creates a StructType - - >>> struct1 = StructType([StructField("f1", StringType, True)]) - >>> struct2 = StructType([StructField("f1", StringType, True)]) - >>> struct1 == struct2 - True - >>> struct1 = StructType([StructField("f1", StringType, True)]) - >>> struct2 = StructType([StructField("f1", StringType, True), - ... [StructField("f2", IntegerType, False)]]) - >>> struct1 == struct2 - False - """ - self.fields = fields - - def __repr__(self): - return ("StructType(List(%s))" % - ",".join(str(field) for field in self.fields)) - - def jsonValue(self): - return {"type": self.typeName(), - "fields": [f.jsonValue() for f in self.fields]} - - @classmethod - def fromJson(cls, json): - return StructType([StructField.fromJson(f) for f in json["fields"]]) - - -class UserDefinedType(DataType): - """ - .. note:: WARN: Spark Internal Use Only - SQL User-Defined Type (UDT). 
- """ - - @classmethod - def typeName(cls): - return cls.__name__.lower() - - @classmethod - def sqlType(cls): - """ - Underlying SQL storage type for this UDT. - """ - raise NotImplementedError("UDT must implement sqlType().") - - @classmethod - def module(cls): - """ - The Python module of the UDT. - """ - raise NotImplementedError("UDT must implement module().") - - @classmethod - def scalaUDT(cls): - """ - The class name of the paired Scala UDT. - """ - raise NotImplementedError("UDT must have a paired Scala UDT.") - - def serialize(self, obj): - """ - Converts the a user-type object into a SQL datum. - """ - raise NotImplementedError("UDT must implement serialize().") - - def deserialize(self, datum): - """ - Converts a SQL datum into a user-type object. - """ - raise NotImplementedError("UDT must implement deserialize().") - - def json(self): - return json.dumps(self.jsonValue(), separators=(',', ':'), sort_keys=True) - - def jsonValue(self): - schema = { - "type": "udt", - "class": self.scalaUDT(), - "pyClass": "%s.%s" % (self.module(), type(self).__name__), - "sqlType": self.sqlType().jsonValue() - } - return schema - - @classmethod - def fromJson(cls, json): - pyUDT = json["pyClass"] - split = pyUDT.rfind(".") - pyModule = pyUDT[:split] - pyClass = pyUDT[split+1:] - m = __import__(pyModule, globals(), locals(), [pyClass], -1) - UDT = getattr(m, pyClass) - return UDT() - - def __eq__(self, other): - return type(self) == type(other) - - -_all_primitive_types = dict((v.typeName(), v) - for v in globals().itervalues() - if type(v) is PrimitiveTypeSingleton and - v.__base__ == PrimitiveType) - - -_all_complex_types = dict((v.typeName(), v) - for v in [ArrayType, MapType, StructType]) - - -def _parse_datatype_json_string(json_string): - """Parses the given data type JSON string. - >>> def check_datatype(datatype): - ... scala_datatype = sqlCtx._ssql_ctx.parseDataType(datatype.json()) - ... python_datatype = _parse_datatype_json_string(scala_datatype.json()) - ... return datatype == python_datatype - >>> all(check_datatype(cls()) for cls in _all_primitive_types.values()) - True - >>> # Simple ArrayType. - >>> simple_arraytype = ArrayType(StringType(), True) - >>> check_datatype(simple_arraytype) - True - >>> # Simple MapType. - >>> simple_maptype = MapType(StringType(), LongType()) - >>> check_datatype(simple_maptype) - True - >>> # Simple StructType. - >>> simple_structtype = StructType([ - ... StructField("a", DecimalType(), False), - ... StructField("b", BooleanType(), True), - ... StructField("c", LongType(), True), - ... StructField("d", BinaryType(), False)]) - >>> check_datatype(simple_structtype) - True - >>> # Complex StructType. - >>> complex_structtype = StructType([ - ... StructField("simpleArray", simple_arraytype, True), - ... StructField("simpleMap", simple_maptype, True), - ... StructField("simpleStruct", simple_structtype, True), - ... StructField("boolean", BooleanType(), False), - ... StructField("withMeta", DoubleType(), False, {"name": "age"})]) - >>> check_datatype(complex_structtype) - True - >>> # Complex ArrayType. - >>> complex_arraytype = ArrayType(complex_structtype, True) - >>> check_datatype(complex_arraytype) - True - >>> # Complex MapType. - >>> complex_maptype = MapType(complex_structtype, - ... complex_arraytype, False) - >>> check_datatype(complex_maptype) - True - >>> check_datatype(ExamplePointUDT()) - True - >>> structtype_with_udt = StructType([StructField("label", DoubleType(), False), - ... 
StructField("point", ExamplePointUDT(), False)]) - >>> check_datatype(structtype_with_udt) - True - """ - return _parse_datatype_json_value(json.loads(json_string)) - - -_FIXED_DECIMAL = re.compile("decimal\\((\\d+),(\\d+)\\)") - - -def _parse_datatype_json_value(json_value): - if type(json_value) is unicode: - if json_value in _all_primitive_types.keys(): - return _all_primitive_types[json_value]() - elif json_value == u'decimal': - return DecimalType() - elif _FIXED_DECIMAL.match(json_value): - m = _FIXED_DECIMAL.match(json_value) - return DecimalType(int(m.group(1)), int(m.group(2))) - else: - raise ValueError("Could not parse datatype: %s" % json_value) - else: - tpe = json_value["type"] - if tpe in _all_complex_types: - return _all_complex_types[tpe].fromJson(json_value) - elif tpe == 'udt': - return UserDefinedType.fromJson(json_value) - else: - raise ValueError("not supported type: %s" % tpe) - - -# Mapping Python types to Spark SQL DataType -_type_mappings = { - type(None): NullType, - bool: BooleanType, - int: IntegerType, - long: LongType, - float: DoubleType, - str: StringType, - unicode: StringType, - bytearray: BinaryType, - decimal.Decimal: DecimalType, - datetime.date: DateType, - datetime.datetime: TimestampType, - datetime.time: TimestampType, -} - - -def _infer_type(obj): - """Infer the DataType from obj - - >>> p = ExamplePoint(1.0, 2.0) - >>> _infer_type(p) - ExamplePointUDT - """ - if obj is None: - raise ValueError("Can not infer type for None") - - if hasattr(obj, '__UDT__'): - return obj.__UDT__ - - dataType = _type_mappings.get(type(obj)) - if dataType is not None: - return dataType() - - if isinstance(obj, dict): - for key, value in obj.iteritems(): - if key is not None and value is not None: - return MapType(_infer_type(key), _infer_type(value), True) - else: - return MapType(NullType(), NullType(), True) - elif isinstance(obj, (list, array)): - for v in obj: - if v is not None: - return ArrayType(_infer_type(obj[0]), True) - else: - return ArrayType(NullType(), True) - else: - try: - return _infer_schema(obj) - except ValueError: - raise ValueError("not supported type: %s" % type(obj)) - - -def _infer_schema(row): - """Infer the schema from dict/namedtuple/object""" - if isinstance(row, dict): - items = sorted(row.items()) - - elif isinstance(row, tuple): - if hasattr(row, "_fields"): # namedtuple - items = zip(row._fields, tuple(row)) - elif hasattr(row, "__FIELDS__"): # Row - items = zip(row.__FIELDS__, tuple(row)) - elif all(isinstance(x, tuple) and len(x) == 2 for x in row): - items = row - else: - raise ValueError("Can't infer schema from tuple") - - elif hasattr(row, "__dict__"): # object - items = sorted(row.__dict__.items()) - - else: - raise ValueError("Can not infer schema for type: %s" % type(row)) - - fields = [StructField(k, _infer_type(v), True) for k, v in items] - return StructType(fields) - - -def _need_python_to_sql_conversion(dataType): - """ - Checks whether we need python to sql conversion for the given type. - For now, only UDTs need this conversion. - - >>> _need_python_to_sql_conversion(DoubleType()) - False - >>> schema0 = StructType([StructField("indices", ArrayType(IntegerType(), False), False), - ... 
StructField("values", ArrayType(DoubleType(), False), False)]) - >>> _need_python_to_sql_conversion(schema0) - False - >>> _need_python_to_sql_conversion(ExamplePointUDT()) - True - >>> schema1 = ArrayType(ExamplePointUDT(), False) - >>> _need_python_to_sql_conversion(schema1) - True - >>> schema2 = StructType([StructField("label", DoubleType(), False), - ... StructField("point", ExamplePointUDT(), False)]) - >>> _need_python_to_sql_conversion(schema2) - True - """ - if isinstance(dataType, StructType): - return any([_need_python_to_sql_conversion(f.dataType) for f in dataType.fields]) - elif isinstance(dataType, ArrayType): - return _need_python_to_sql_conversion(dataType.elementType) - elif isinstance(dataType, MapType): - return _need_python_to_sql_conversion(dataType.keyType) or \ - _need_python_to_sql_conversion(dataType.valueType) - elif isinstance(dataType, UserDefinedType): - return True - else: - return False - - -def _python_to_sql_converter(dataType): - """ - Returns a converter that converts a Python object into a SQL datum for the given type. - - >>> conv = _python_to_sql_converter(DoubleType()) - >>> conv(1.0) - 1.0 - >>> conv = _python_to_sql_converter(ArrayType(DoubleType(), False)) - >>> conv([1.0, 2.0]) - [1.0, 2.0] - >>> conv = _python_to_sql_converter(ExamplePointUDT()) - >>> conv(ExamplePoint(1.0, 2.0)) - [1.0, 2.0] - >>> schema = StructType([StructField("label", DoubleType(), False), - ... StructField("point", ExamplePointUDT(), False)]) - >>> conv = _python_to_sql_converter(schema) - >>> conv((1.0, ExamplePoint(1.0, 2.0))) - (1.0, [1.0, 2.0]) - """ - if not _need_python_to_sql_conversion(dataType): - return lambda x: x - - if isinstance(dataType, StructType): - names, types = zip(*[(f.name, f.dataType) for f in dataType.fields]) - converters = map(_python_to_sql_converter, types) - - def converter(obj): - if isinstance(obj, dict): - return tuple(c(obj.get(n)) for n, c in zip(names, converters)) - elif isinstance(obj, tuple): - if hasattr(obj, "_fields") or hasattr(obj, "__FIELDS__"): - return tuple(c(v) for c, v in zip(converters, obj)) - elif all(isinstance(x, tuple) and len(x) == 2 for x in obj): # k-v pairs - d = dict(obj) - return tuple(c(d.get(n)) for n, c in zip(names, converters)) - else: - return tuple(c(v) for c, v in zip(converters, obj)) - else: - raise ValueError("Unexpected tuple %r with type %r" % (obj, dataType)) - return converter - elif isinstance(dataType, ArrayType): - element_converter = _python_to_sql_converter(dataType.elementType) - return lambda a: [element_converter(v) for v in a] - elif isinstance(dataType, MapType): - key_converter = _python_to_sql_converter(dataType.keyType) - value_converter = _python_to_sql_converter(dataType.valueType) - return lambda m: dict([(key_converter(k), value_converter(v)) for k, v in m.items()]) - elif isinstance(dataType, UserDefinedType): - return lambda obj: dataType.serialize(obj) - else: - raise ValueError("Unexpected type %r" % dataType) - - -def _has_nulltype(dt): - """ Return whether there is NullType in `dt` or not """ - if isinstance(dt, StructType): - return any(_has_nulltype(f.dataType) for f in dt.fields) - elif isinstance(dt, ArrayType): - return _has_nulltype((dt.elementType)) - elif isinstance(dt, MapType): - return _has_nulltype(dt.keyType) or _has_nulltype(dt.valueType) - else: - return isinstance(dt, NullType) - - -def _merge_type(a, b): - if isinstance(a, NullType): - return b - elif isinstance(b, NullType): - return a - elif type(a) is not type(b): - # TODO: type cast (such as int -> long) 
- raise TypeError("Can not merge type %s and %s" % (a, b)) - - # same type - if isinstance(a, StructType): - nfs = dict((f.name, f.dataType) for f in b.fields) - fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()))) - for f in a.fields] - names = set([f.name for f in fields]) - for n in nfs: - if n not in names: - fields.append(StructField(n, nfs[n])) - return StructType(fields) - - elif isinstance(a, ArrayType): - return ArrayType(_merge_type(a.elementType, b.elementType), True) - - elif isinstance(a, MapType): - return MapType(_merge_type(a.keyType, b.keyType), - _merge_type(a.valueType, b.valueType), - True) - else: - return a - - -def _create_converter(dataType): - """Create an converter to drop the names of fields in obj """ - if isinstance(dataType, ArrayType): - conv = _create_converter(dataType.elementType) - return lambda row: map(conv, row) - - elif isinstance(dataType, MapType): - kconv = _create_converter(dataType.keyType) - vconv = _create_converter(dataType.valueType) - return lambda row: dict((kconv(k), vconv(v)) for k, v in row.iteritems()) - - elif isinstance(dataType, NullType): - return lambda x: None - - elif not isinstance(dataType, StructType): - return lambda x: x - - # dataType must be StructType - names = [f.name for f in dataType.fields] - converters = [_create_converter(f.dataType) for f in dataType.fields] - - def convert_struct(obj): - if obj is None: - return - - if isinstance(obj, tuple): - if hasattr(obj, "_fields"): - d = dict(zip(obj._fields, obj)) - elif hasattr(obj, "__FIELDS__"): - d = dict(zip(obj.__FIELDS__, obj)) - elif all(isinstance(x, tuple) and len(x) == 2 for x in obj): - d = dict(obj) - else: - raise ValueError("unexpected tuple: %s" % str(obj)) - - elif isinstance(obj, dict): - d = obj - elif hasattr(obj, "__dict__"): # object - d = obj.__dict__ - else: - raise ValueError("Unexpected obj: %s" % obj) - - return tuple([conv(d.get(name)) for name, conv in zip(names, converters)]) - - return convert_struct - - -_BRACKETS = {'(': ')', '[': ']', '{': '}'} - - -def _split_schema_abstract(s): - """ - split the schema abstract into fields - - >>> _split_schema_abstract("a b c") - ['a', 'b', 'c'] - >>> _split_schema_abstract("a(a b)") - ['a(a b)'] - >>> _split_schema_abstract("a b[] c{a b}") - ['a', 'b[]', 'c{a b}'] - >>> _split_schema_abstract(" ") - [] - """ - - r = [] - w = '' - brackets = [] - for c in s: - if c == ' ' and not brackets: - if w: - r.append(w) - w = '' - else: - w += c - if c in _BRACKETS: - brackets.append(c) - elif c in _BRACKETS.values(): - if not brackets or c != _BRACKETS[brackets.pop()]: - raise ValueError("unexpected " + c) - - if brackets: - raise ValueError("brackets not closed: %s" % brackets) - if w: - r.append(w) - return r - - -def _parse_field_abstract(s): - """ - Parse a field in schema abstract - - >>> _parse_field_abstract("a") - StructField(a,None,true) - >>> _parse_field_abstract("b(c d)") - StructField(b,StructType(...c,None,true),StructField(d... - >>> _parse_field_abstract("a[]") - StructField(a,ArrayType(None,true),true) - >>> _parse_field_abstract("a{[]}") - StructField(a,MapType(None,ArrayType(None,true),true),true) - """ - if set(_BRACKETS.keys()) & set(s): - idx = min((s.index(c) for c in _BRACKETS if c in s)) - name = s[:idx] - return StructField(name, _parse_schema_abstract(s[idx:]), True) - else: - return StructField(s, None, True) - - -def _parse_schema_abstract(s): - """ - parse abstract into schema - - >>> _parse_schema_abstract("a b c") - StructType...a...b...c... 
- >>> _parse_schema_abstract("a[b c] b{}") - StructType...a,ArrayType...b...c...b,MapType... - >>> _parse_schema_abstract("c{} d{a b}") - StructType...c,MapType...d,MapType...a...b... - >>> _parse_schema_abstract("a b(t)").fields[1] - StructField(b,StructType(List(StructField(t,None,true))),true) - """ - s = s.strip() - if not s: - return - - elif s.startswith('('): - return _parse_schema_abstract(s[1:-1]) - - elif s.startswith('['): - return ArrayType(_parse_schema_abstract(s[1:-1]), True) - - elif s.startswith('{'): - return MapType(None, _parse_schema_abstract(s[1:-1])) - - parts = _split_schema_abstract(s) - fields = [_parse_field_abstract(p) for p in parts] - return StructType(fields) - - -def _infer_schema_type(obj, dataType): - """ - Fill the dataType with types inferred from obj - - >>> schema = _parse_schema_abstract("a b c d") - >>> row = (1, 1.0, "str", datetime.date(2014, 10, 10)) - >>> _infer_schema_type(row, schema) - StructType...IntegerType...DoubleType...StringType...DateType... - >>> row = [[1], {"key": (1, 2.0)}] - >>> schema = _parse_schema_abstract("a[] b{c d}") - >>> _infer_schema_type(row, schema) - StructType...a,ArrayType...b,MapType(StringType,...c,IntegerType... - """ - if dataType is None: - return _infer_type(obj) - - if not obj: - return NullType() - - if isinstance(dataType, ArrayType): - eType = _infer_schema_type(obj[0], dataType.elementType) - return ArrayType(eType, True) - - elif isinstance(dataType, MapType): - k, v = obj.iteritems().next() - return MapType(_infer_schema_type(k, dataType.keyType), - _infer_schema_type(v, dataType.valueType)) - - elif isinstance(dataType, StructType): - fs = dataType.fields - assert len(fs) == len(obj), \ - "Obj(%s) have different length with fields(%s)" % (obj, fs) - fields = [StructField(f.name, _infer_schema_type(o, f.dataType), True) - for o, f in zip(obj, fs)] - return StructType(fields) - - else: - raise ValueError("Unexpected dataType: %s" % dataType) - - -_acceptable_types = { - BooleanType: (bool,), - ByteType: (int, long), - ShortType: (int, long), - IntegerType: (int, long), - LongType: (int, long), - FloatType: (float,), - DoubleType: (float,), - DecimalType: (decimal.Decimal,), - StringType: (str, unicode), - BinaryType: (bytearray,), - DateType: (datetime.date,), - TimestampType: (datetime.datetime,), - ArrayType: (list, tuple, array), - MapType: (dict,), - StructType: (tuple, list), -} - - -def _verify_type(obj, dataType): - """ - Verify the type of obj against dataType, raise an exception if - they do not match. - - >>> _verify_type(None, StructType([])) - >>> _verify_type("", StringType()) - >>> _verify_type(0, IntegerType()) - >>> _verify_type(range(3), ArrayType(ShortType())) - >>> _verify_type(set(), ArrayType(StringType())) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - TypeError:... - >>> _verify_type({}, MapType(StringType(), IntegerType())) - >>> _verify_type((), StructType([])) - >>> _verify_type([], StructType([])) - >>> _verify_type([1], StructType([])) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError:... - >>> _verify_type(ExamplePoint(1.0, 2.0), ExamplePointUDT()) - >>> _verify_type([1.0, 2.0], ExamplePointUDT()) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError:... 
- """ - # all objects are nullable - if obj is None: - return - - if isinstance(dataType, UserDefinedType): - if not (hasattr(obj, '__UDT__') and obj.__UDT__ == dataType): - raise ValueError("%r is not an instance of type %r" % (obj, dataType)) - _verify_type(dataType.serialize(obj), dataType.sqlType()) - return - - _type = type(dataType) - assert _type in _acceptable_types, "unkown datatype: %s" % dataType - - # subclass of them can not be deserialized in JVM - if type(obj) not in _acceptable_types[_type]: - raise TypeError("%s can not accept object in type %s" - % (dataType, type(obj))) - - if isinstance(dataType, ArrayType): - for i in obj: - _verify_type(i, dataType.elementType) - - elif isinstance(dataType, MapType): - for k, v in obj.iteritems(): - _verify_type(k, dataType.keyType) - _verify_type(v, dataType.valueType) - - elif isinstance(dataType, StructType): - if len(obj) != len(dataType.fields): - raise ValueError("Length of object (%d) does not match with" - "length of fields (%d)" % (len(obj), len(dataType.fields))) - for v, f in zip(obj, dataType.fields): - _verify_type(v, f.dataType) - - -_cached_cls = {} - - -def _restore_object(dataType, obj): - """ Restore object during unpickling. """ - # use id(dataType) as key to speed up lookup in dict - # Because of batched pickling, dataType will be the - # same object in most cases. - k = id(dataType) - cls = _cached_cls.get(k) - if cls is None: - # use dataType as key to avoid create multiple class - cls = _cached_cls.get(dataType) - if cls is None: - cls = _create_cls(dataType) - _cached_cls[dataType] = cls - _cached_cls[k] = cls - return cls(obj) - - -def _create_object(cls, v): - """ Create an customized object with class `cls`. """ - # datetime.date would be deserialized as datetime.datetime - # from java type, so we need to set it back. - if cls is datetime.date and isinstance(v, datetime.datetime): - return v.date() - return cls(v) if v is not None else v - - -def _create_getter(dt, i): - """ Create a getter for item `i` with schema """ - cls = _create_cls(dt) - - def getter(self): - return _create_object(cls, self[i]) - - return getter - - -def _has_struct_or_date(dt): - """Return whether `dt` is or has StructType/DateType in it""" - if isinstance(dt, StructType): - return True - elif isinstance(dt, ArrayType): - return _has_struct_or_date(dt.elementType) - elif isinstance(dt, MapType): - return _has_struct_or_date(dt.keyType) or _has_struct_or_date(dt.valueType) - elif isinstance(dt, DateType): - return True - elif isinstance(dt, UserDefinedType): - return True - return False - - -def _create_properties(fields): - """Create properties according to fields""" - ps = {} - for i, f in enumerate(fields): - name = f.name - if (name.startswith("__") and name.endswith("__") - or keyword.iskeyword(name)): - warnings.warn("field name %s can not be accessed in Python," - "use position to access it instead" % name) - if _has_struct_or_date(f.dataType): - # delay creating object until accessing it - getter = _create_getter(f.dataType, i) - else: - getter = itemgetter(i) - ps[name] = property(getter) - return ps - - -def _create_cls(dataType): - """ - Create an class by dataType - - The created class is similar to namedtuple, but can have nested schema. 
- - >>> schema = _parse_schema_abstract("a b c") - >>> row = (1, 1.0, "str") - >>> schema = _infer_schema_type(row, schema) - >>> obj = _create_cls(schema)(row) - >>> import pickle - >>> pickle.loads(pickle.dumps(obj)) - Row(a=1, b=1.0, c='str') - - >>> row = [[1], {"key": (1, 2.0)}] - >>> schema = _parse_schema_abstract("a[] b{c d}") - >>> schema = _infer_schema_type(row, schema) - >>> obj = _create_cls(schema)(row) - >>> pickle.loads(pickle.dumps(obj)) - Row(a=[1], b={'key': Row(c=1, d=2.0)}) - >>> pickle.loads(pickle.dumps(obj.a)) - [1] - >>> pickle.loads(pickle.dumps(obj.b)) - {'key': Row(c=1, d=2.0)} - """ - - if isinstance(dataType, ArrayType): - cls = _create_cls(dataType.elementType) - - def List(l): - if l is None: - return - return [_create_object(cls, v) for v in l] - - return List - - elif isinstance(dataType, MapType): - kcls = _create_cls(dataType.keyType) - vcls = _create_cls(dataType.valueType) - - def Dict(d): - if d is None: - return - return dict((_create_object(kcls, k), _create_object(vcls, v)) for k, v in d.items()) - - return Dict - - elif isinstance(dataType, DateType): - return datetime.date - - elif isinstance(dataType, UserDefinedType): - return lambda datum: dataType.deserialize(datum) - - elif not isinstance(dataType, StructType): - # no wrapper for primitive types - return lambda x: x - - class Row(tuple): - - """ Row in DataFrame """ - __DATATYPE__ = dataType - __FIELDS__ = tuple(f.name for f in dataType.fields) - __slots__ = () - - # create property for fast access - locals().update(_create_properties(dataType.fields)) - - def asDict(self): - """ Return as a dict """ - return dict((n, getattr(self, n)) for n in self.__FIELDS__) - - def __repr__(self): - # call collect __repr__ for nested objects - return ("Row(%s)" % ", ".join("%s=%r" % (n, getattr(self, n)) - for n in self.__FIELDS__)) - - def __reduce__(self): - return (_restore_object, (self.__DATATYPE__, tuple(self))) - - return Row - - -class SQLContext(object): - - """Main entry point for Spark SQL functionality. - - A SQLContext can be used create L{DataFrame}, register L{DataFrame} as - tables, execute SQL over tables, cache tables, and read parquet files. - """ - - def __init__(self, sparkContext, sqlContext=None): - """Create a new SQLContext. - - :param sparkContext: The SparkContext to wrap. - :param sqlContext: An optional JVM Scala SQLContext. If set, we do not instatiate a new - SQLContext in the JVM, instead we make all calls to this object. - - >>> df = sqlCtx.inferSchema(rdd) - >>> sqlCtx.inferSchema(df) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - TypeError:... - - >>> bad_rdd = sc.parallelize([1,2,3]) - >>> sqlCtx.inferSchema(bad_rdd) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError:... - - >>> from datetime import datetime - >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1L, - ... b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1), - ... time=datetime(2014, 8, 1, 14, 1, 5))]) - >>> df = sqlCtx.inferSchema(allTypes) - >>> df.registerTempTable("allTypes") - >>> sqlCtx.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a ' - ... 'from allTypes where b and i > 0').collect() - [Row(c0=2, c1=2.0, c2=False, c3=2, c4=0...8, 1, 14, 1, 5), a=1)] - >>> df.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, - ... 
x.row.a, x.list)).collect() - [(1, u'string', 1.0, 1, True, ...(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] - """ - self._sc = sparkContext - self._jsc = self._sc._jsc - self._jvm = self._sc._jvm - self._scala_SQLContext = sqlContext - - @property - def _ssql_ctx(self): - """Accessor for the JVM Spark SQL context. - - Subclasses can override this property to provide their own - JVM Contexts. - """ - if self._scala_SQLContext is None: - self._scala_SQLContext = self._jvm.SQLContext(self._jsc.sc()) - return self._scala_SQLContext - - def registerFunction(self, name, f, returnType=StringType()): - """Registers a lambda function as a UDF so it can be used in SQL statements. - - In addition to a name and the function itself, the return type can be optionally specified. - When the return type is not given it default to a string and conversion will automatically - be done. For any other return type, the produced object must match the specified type. - - >>> sqlCtx.registerFunction("stringLengthString", lambda x: len(x)) - >>> sqlCtx.sql("SELECT stringLengthString('test')").collect() - [Row(c0=u'4')] - >>> sqlCtx.registerFunction("stringLengthInt", lambda x: len(x), IntegerType()) - >>> sqlCtx.sql("SELECT stringLengthInt('test')").collect() - [Row(c0=4)] - """ - func = lambda _, it: imap(lambda x: f(*x), it) - ser = AutoBatchedSerializer(PickleSerializer()) - command = (func, None, ser, ser) - pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(self._sc, command, self) - self._ssql_ctx.udf().registerPython(name, - bytearray(pickled_cmd), - env, - includes, - self._sc.pythonExec, - bvars, - self._sc._javaAccumulator, - returnType.json()) - - def inferSchema(self, rdd, samplingRatio=None): - """Infer and apply a schema to an RDD of L{Row}. - - When samplingRatio is specified, the schema is inferred by looking - at the types of each row in the sampled dataset. Otherwise, the - first 100 rows of the RDD are inspected. Nested collections are - supported, which can include array, dict, list, Row, tuple, - namedtuple, or object. - - Each row could be L{pyspark.sql.Row} object or namedtuple or objects. - Using top level dicts is deprecated, as dict is used to represent Maps. - - If a single column has multiple distinct inferred types, it may cause - runtime exceptions. - - >>> rdd = sc.parallelize( - ... [Row(field1=1, field2="row1"), - ... Row(field1=2, field2="row2"), - ... Row(field1=3, field2="row3")]) - >>> df = sqlCtx.inferSchema(rdd) - >>> df.collect()[0] - Row(field1=1, field2=u'row1') - - >>> NestedRow = Row("f1", "f2") - >>> nestedRdd1 = sc.parallelize([ - ... NestedRow(array('i', [1, 2]), {"row1": 1.0}), - ... NestedRow(array('i', [2, 3]), {"row2": 2.0})]) - >>> df = sqlCtx.inferSchema(nestedRdd1) - >>> df.collect() - [Row(f1=[1, 2], f2={u'row1': 1.0}), ..., f2={u'row2': 2.0})] - - >>> nestedRdd2 = sc.parallelize([ - ... NestedRow([[1, 2], [2, 3]], [1, 2]), - ... NestedRow([[2, 3], [3, 4]], [2, 3])]) - >>> df = sqlCtx.inferSchema(nestedRdd2) - >>> df.collect() - [Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), ..., f2=[2, 3])] - - >>> from collections import namedtuple - >>> CustomRow = namedtuple('CustomRow', 'field1 field2') - >>> rdd = sc.parallelize( - ... [CustomRow(field1=1, field2="row1"), - ... CustomRow(field1=2, field2="row2"), - ... 
CustomRow(field1=3, field2="row3")]) - >>> df = sqlCtx.inferSchema(rdd) - >>> df.collect()[0] - Row(field1=1, field2=u'row1') - """ - - if isinstance(rdd, DataFrame): - raise TypeError("Cannot apply schema to DataFrame") - - first = rdd.first() - if not first: - raise ValueError("The first row in RDD is empty, " - "can not infer schema") - if type(first) is dict: - warnings.warn("Using RDD of dict to inferSchema is deprecated," - "please use pyspark.sql.Row instead") - - if samplingRatio is None: - schema = _infer_schema(first) - if _has_nulltype(schema): - for row in rdd.take(100)[1:]: - schema = _merge_type(schema, _infer_schema(row)) - if not _has_nulltype(schema): - break - else: - warnings.warn("Some of types cannot be determined by the " - "first 100 rows, please try again with sampling") - else: - if samplingRatio > 0.99: - rdd = rdd.sample(False, float(samplingRatio)) - schema = rdd.map(_infer_schema).reduce(_merge_type) - - converter = _create_converter(schema) - rdd = rdd.map(converter) - return self.applySchema(rdd, schema) - - def applySchema(self, rdd, schema): - """ - Applies the given schema to the given RDD of L{tuple} or L{list}. - - These tuples or lists can contain complex nested structures like - lists, maps or nested rows. - - The schema should be a StructType. - - It is important that the schema matches the types of the objects - in each row or exceptions could be thrown at runtime. - - >>> rdd2 = sc.parallelize([(1, "row1"), (2, "row2"), (3, "row3")]) - >>> schema = StructType([StructField("field1", IntegerType(), False), - ... StructField("field2", StringType(), False)]) - >>> df = sqlCtx.applySchema(rdd2, schema) - >>> sqlCtx.registerRDDAsTable(df, "table1") - >>> df2 = sqlCtx.sql("SELECT * from table1") - >>> df2.collect() - [Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')] - - >>> from datetime import date, datetime - >>> rdd = sc.parallelize([(127, -128L, -32768, 32767, 2147483647L, 1.0, - ... date(2010, 1, 1), - ... datetime(2010, 1, 1, 1, 1, 1), - ... {"a": 1}, (2,), [1, 2, 3], None)]) - >>> schema = StructType([ - ... StructField("byte1", ByteType(), False), - ... StructField("byte2", ByteType(), False), - ... StructField("short1", ShortType(), False), - ... StructField("short2", ShortType(), False), - ... StructField("int", IntegerType(), False), - ... StructField("float", FloatType(), False), - ... StructField("date", DateType(), False), - ... StructField("time", TimestampType(), False), - ... StructField("map", - ... MapType(StringType(), IntegerType(), False), False), - ... StructField("struct", - ... StructType([StructField("b", ShortType(), False)]), False), - ... StructField("list", ArrayType(ByteType(), False), False), - ... StructField("null", DoubleType(), True)]) - >>> df = sqlCtx.applySchema(rdd, schema) - >>> results = df.map( - ... lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int, x.float, x.date, - ... x.time, x.map["a"], x.struct.b, x.list, x.null)) - >>> results.collect()[0] # doctest: +NORMALIZE_WHITESPACE - (127, -128, -32768, 32767, 2147483647, 1.0, datetime.date(2010, 1, 1), - datetime.datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None) - - >>> df.registerTempTable("table2") - >>> sqlCtx.sql( - ... "SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " + - ... "short1 + 1 AS short1, short2 - 1 AS short2, int - 1 AS int, " + - ... "float + 1.5 as float FROM table2").collect() - [Row(byte1=126, byte2=-127, short1=-32767, short2=32766, int=2147483646, float=2.5)] - - >>> rdd = sc.parallelize([(127, -32768, 1.0, - ... 
datetime(2010, 1, 1, 1, 1, 1), - ... {"a": 1}, (2,), [1, 2, 3])]) - >>> abstract = "byte short float time map{} struct(b) list[]" - >>> schema = _parse_schema_abstract(abstract) - >>> typedSchema = _infer_schema_type(rdd.first(), schema) - >>> df = sqlCtx.applySchema(rdd, typedSchema) - >>> df.collect() - [Row(byte=127, short=-32768, float=1.0, time=..., list=[1, 2, 3])] - """ - - if isinstance(rdd, DataFrame): - raise TypeError("Cannot apply schema to DataFrame") - - if not isinstance(schema, StructType): - raise TypeError("schema should be StructType") - - # take the first few rows to verify schema - rows = rdd.take(10) - # Row() cannot been deserialized by Pyrolite - if rows and isinstance(rows[0], tuple) and rows[0].__class__.__name__ == 'Row': - rdd = rdd.map(tuple) - rows = rdd.take(10) - - for row in rows: - _verify_type(row, schema) - - # convert python objects to sql data - converter = _python_to_sql_converter(schema) - rdd = rdd.map(converter) - - jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) - df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) - return DataFrame(df, self) - - def registerRDDAsTable(self, rdd, tableName): - """Registers the given RDD as a temporary table in the catalog. - - Temporary tables exist only during the lifetime of this instance of - SQLContext. - - >>> df = sqlCtx.inferSchema(rdd) - >>> sqlCtx.registerRDDAsTable(df, "table1") - """ - if (rdd.__class__ is DataFrame): - df = rdd._jdf - self._ssql_ctx.registerRDDAsTable(df, tableName) - else: - raise ValueError("Can only register DataFrame as table") - - def parquetFile(self, *paths): - """Loads a Parquet file, returning the result as a L{DataFrame}. - - >>> import tempfile, shutil - >>> parquetFile = tempfile.mkdtemp() - >>> shutil.rmtree(parquetFile) - >>> df = sqlCtx.inferSchema(rdd) - >>> df.saveAsParquetFile(parquetFile) - >>> df2 = sqlCtx.parquetFile(parquetFile) - >>> sorted(df.collect()) == sorted(df2.collect()) - True - """ - gateway = self._sc._gateway - jpath = paths[0] - jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths) - 1) - for i in range(1, len(paths)): - jpaths[i] = paths[i] - jdf = self._ssql_ctx.parquetFile(jpath, jpaths) - return DataFrame(jdf, self) - - def jsonFile(self, path, schema=None, samplingRatio=1.0): - """ - Loads a text file storing one JSON object per line as a - L{DataFrame}. - - If the schema is provided, applies the given schema to this - JSON dataset. - - Otherwise, it samples the dataset with ratio `samplingRatio` to - determine the schema. - - >>> import tempfile, shutil - >>> jsonFile = tempfile.mkdtemp() - >>> shutil.rmtree(jsonFile) - >>> ofn = open(jsonFile, 'w') - >>> for json in jsonStrings: - ... print>>ofn, json - >>> ofn.close() - >>> df1 = sqlCtx.jsonFile(jsonFile) - >>> sqlCtx.registerRDDAsTable(df1, "table1") - >>> df2 = sqlCtx.sql( - ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " - ... "field6 as f4 from table1") - >>> for r in df2.collect(): - ... print r - Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) - Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) - Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) - - >>> df3 = sqlCtx.jsonFile(jsonFile, df1.schema()) - >>> sqlCtx.registerRDDAsTable(df3, "table2") - >>> df4 = sqlCtx.sql( - ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " - ... "field6 as f4 from table2") - >>> for r in df4.collect(): - ... 
print r - Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) - Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) - Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) - - >>> schema = StructType([ - ... StructField("field2", StringType(), True), - ... StructField("field3", - ... StructType([ - ... StructField("field5", - ... ArrayType(IntegerType(), False), True)]), False)]) - >>> df5 = sqlCtx.jsonFile(jsonFile, schema) - >>> sqlCtx.registerRDDAsTable(df5, "table3") - >>> df6 = sqlCtx.sql( - ... "SELECT field2 AS f1, field3.field5 as f2, " - ... "field3.field5[0] as f3 from table3") - >>> df6.collect() - [Row(f1=u'row1', f2=None, f3=None)...Row(f1=u'row3', f2=[], f3=None)] - """ - if schema is None: - df = self._ssql_ctx.jsonFile(path, samplingRatio) - else: - scala_datatype = self._ssql_ctx.parseDataType(schema.json()) - df = self._ssql_ctx.jsonFile(path, scala_datatype) - return DataFrame(df, self) - - def jsonRDD(self, rdd, schema=None, samplingRatio=1.0): - """Loads an RDD storing one JSON object per string as a L{DataFrame}. - - If the schema is provided, applies the given schema to this - JSON dataset. - - Otherwise, it samples the dataset with ratio `samplingRatio` to - determine the schema. - - >>> df1 = sqlCtx.jsonRDD(json) - >>> sqlCtx.registerRDDAsTable(df1, "table1") - >>> df2 = sqlCtx.sql( - ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " - ... "field6 as f4 from table1") - >>> for r in df2.collect(): - ... print r - Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) - Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) - Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) - - >>> df3 = sqlCtx.jsonRDD(json, df1.schema()) - >>> sqlCtx.registerRDDAsTable(df3, "table2") - >>> df4 = sqlCtx.sql( - ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " - ... "field6 as f4 from table2") - >>> for r in df4.collect(): - ... print r - Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) - Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) - Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) - - >>> schema = StructType([ - ... StructField("field2", StringType(), True), - ... StructField("field3", - ... StructType([ - ... StructField("field5", - ... ArrayType(IntegerType(), False), True)]), False)]) - >>> df5 = sqlCtx.jsonRDD(json, schema) - >>> sqlCtx.registerRDDAsTable(df5, "table3") - >>> df6 = sqlCtx.sql( - ... "SELECT field2 AS f1, field3.field5 as f2, " - ... "field3.field5[0] as f3 from table3") - >>> df6.collect() - [Row(f1=u'row1', f2=None,...Row(f1=u'row3', f2=[], f3=None)] - - >>> sqlCtx.jsonRDD(sc.parallelize(['{}', - ... '{"key0": {"key1": "value1"}}'])).collect() - [Row(key0=None), Row(key0=Row(key1=u'value1'))] - >>> sqlCtx.jsonRDD(sc.parallelize(['{"key0": null}', - ... 
'{"key0": {"key1": "value1"}}'])).collect() - [Row(key0=None), Row(key0=Row(key1=u'value1'))] - """ - - def func(iterator): - for x in iterator: - if not isinstance(x, basestring): - x = unicode(x) - if isinstance(x, unicode): - x = x.encode("utf-8") - yield x - keyed = rdd.mapPartitions(func) - keyed._bypass_serializer = True - jrdd = keyed._jrdd.map(self._jvm.BytesToString()) - if schema is None: - df = self._ssql_ctx.jsonRDD(jrdd.rdd(), samplingRatio) - else: - scala_datatype = self._ssql_ctx.parseDataType(schema.json()) - df = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) - return DataFrame(df, self) - - def sql(self, sqlQuery): - """Return a L{DataFrame} representing the result of the given query. - - >>> df = sqlCtx.inferSchema(rdd) - >>> sqlCtx.registerRDDAsTable(df, "table1") - >>> df2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1") - >>> df2.collect() - [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] - """ - return DataFrame(self._ssql_ctx.sql(sqlQuery), self) - - def table(self, tableName): - """Returns the specified table as a L{DataFrame}. - - >>> df = sqlCtx.inferSchema(rdd) - >>> sqlCtx.registerRDDAsTable(df, "table1") - >>> df2 = sqlCtx.table("table1") - >>> sorted(df.collect()) == sorted(df2.collect()) - True - """ - return DataFrame(self._ssql_ctx.table(tableName), self) - - def cacheTable(self, tableName): - """Caches the specified table in-memory.""" - self._ssql_ctx.cacheTable(tableName) - - def uncacheTable(self, tableName): - """Removes the specified table from the in-memory cache.""" - self._ssql_ctx.uncacheTable(tableName) - - -class HiveContext(SQLContext): - - """A variant of Spark SQL that integrates with data stored in Hive. - - Configuration for Hive is read from hive-site.xml on the classpath. - It supports running both SQL and HiveQL commands. - """ - - def __init__(self, sparkContext, hiveContext=None): - """Create a new HiveContext. - - :param sparkContext: The SparkContext to wrap. - :param hiveContext: An optional JVM Scala HiveContext. If set, we do not instatiate a new - HiveContext in the JVM, instead we make all calls to this object. - """ - SQLContext.__init__(self, sparkContext) - - if hiveContext: - self._scala_HiveContext = hiveContext - - @property - def _ssql_ctx(self): - try: - if not hasattr(self, '_scala_HiveContext'): - self._scala_HiveContext = self._get_hive_ctx() - return self._scala_HiveContext - except Py4JError as e: - raise Exception("You must build Spark with Hive. " - "Export 'SPARK_HIVE=true' and run " - "build/sbt assembly", e) - - def _get_hive_ctx(self): - return self._jvm.HiveContext(self._jsc.sc()) - - -def _create_row(fields, values): - row = Row(*values) - row.__FIELDS__ = fields - return row - - -class Row(tuple): - - """ - A row in L{DataFrame}. The fields in it can be accessed like attributes. - - Row can be used to create a row object by using named arguments, - the fields will be sorted by names. 
- - >>> row = Row(name="Alice", age=11) - >>> row - Row(age=11, name='Alice') - >>> row.name, row.age - ('Alice', 11) - - Row also can be used to create another Row like class, then it - could be used to create Row objects, such as - - >>> Person = Row("name", "age") - >>> Person - - >>> Person("Alice", 11) - Row(name='Alice', age=11) - """ - - def __new__(self, *args, **kwargs): - if args and kwargs: - raise ValueError("Can not use both args " - "and kwargs to create Row") - if args: - # create row class or objects - return tuple.__new__(self, args) - - elif kwargs: - # create row objects - names = sorted(kwargs.keys()) - values = tuple(kwargs[n] for n in names) - row = tuple.__new__(self, values) - row.__FIELDS__ = names - return row - - else: - raise ValueError("No args or kwargs") - - def asDict(self): - """ - Return as an dict - """ - if not hasattr(self, "__FIELDS__"): - raise TypeError("Cannot convert a Row class into dict") - return dict(zip(self.__FIELDS__, self)) - - # let obect acs like class - def __call__(self, *args): - """create new Row object""" - return _create_row(self, args) - - def __getattr__(self, item): - if item.startswith("__"): - raise AttributeError(item) - try: - # it will be slow when it has many fields, - # but this will not be used in normal cases - idx = self.__FIELDS__.index(item) - return self[idx] - except IndexError: - raise AttributeError(item) - - def __reduce__(self): - if hasattr(self, "__FIELDS__"): - return (_create_row, (self.__FIELDS__, tuple(self))) - else: - return tuple.__reduce__(self) - - def __repr__(self): - if hasattr(self, "__FIELDS__"): - return "Row(%s)" % ", ".join("%s=%r" % (k, v) - for k, v in zip(self.__FIELDS__, self)) - else: - return "" % ", ".join(self) - - -class DataFrame(object): - - """A collection of rows that have the same columns. - - A :class:`DataFrame` is equivalent to a relational table in Spark SQL, - and can be created using various functions in :class:`SQLContext`:: - - people = sqlContext.parquetFile("...") - - Once created, it can be manipulated using the various domain-specific-language - (DSL) functions defined in: :class:`DataFrame`, :class:`Column`. - - To select a column from the data frame, use the apply method:: - - ageCol = people.age - - Note that the :class:`Column` type can also be manipulated - through its various functions:: - - # The following creates a new column that increases everybody's age by 10. - people.age + 10 - - - A more concrete example:: - - # To create DataFrame using SQLContext - people = sqlContext.parquetFile("...") - department = sqlContext.parquetFile("...") - - people.filter(people.age > 30).join(department, people.deptId == department.id)) \ - .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"}) - """ - - def __init__(self, jdf, sql_ctx): - self._jdf = jdf - self.sql_ctx = sql_ctx - self._sc = sql_ctx and sql_ctx._sc - self.is_cached = False - - @property - def rdd(self): - """ - Return the content of the :class:`DataFrame` as an :class:`RDD` - of :class:`Row` s. - """ - if not hasattr(self, '_lazy_rdd'): - jrdd = self._jdf.javaToPython() - rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer())) - schema = self.schema() - - def applySchema(it): - cls = _create_cls(schema) - return itertools.imap(cls, it) - - self._lazy_rdd = rdd.mapPartitions(applySchema) - - return self._lazy_rdd - - def toJSON(self, use_unicode=False): - """Convert a DataFrame into a MappedRDD of JSON documents; one document per row. 
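Since the rdd property above is the bridge back from JVM rows to ordinary Python Row objects, a short sketch of how it composes with RDD operations may help; it reuses the df fixture that _test() wires into these doctests (two rows: Alice aged 2, Bob aged 5):

    # .rdd is built lazily, cached on the DataFrame, and yields Row objects
    # whose fields are attribute-accessible.
    names = df.rdd.map(lambda row: row.name).collect()
    print names                     # [u'Alice', u'Bob']

    # df.map(f), defined below, is shorthand for df.rdd.map(f).
    assert df.map(lambda row: row.age).sum() == 7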
- - >>> df1 = sqlCtx.jsonRDD(json) - >>> sqlCtx.registerRDDAsTable(df1, "table1") - >>> df2 = sqlCtx.sql( "SELECT * from table1") - >>> df2.toJSON().take(1)[0] == '{"field1":1,"field2":"row1","field3":{"field4":11}}' - True - >>> df3 = sqlCtx.sql( "SELECT field3.field4 from table1") - >>> df3.toJSON().collect() == ['{"field4":11}', '{"field4":22}', '{"field4":33}'] - True - """ - rdd = self._jdf.toJSON() - return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode)) - - def saveAsParquetFile(self, path): - """Save the contents as a Parquet file, preserving the schema. - - Files that are written out using this method can be read back in as - a DataFrame using the L{SQLContext.parquetFile} method. - - >>> import tempfile, shutil - >>> parquetFile = tempfile.mkdtemp() - >>> shutil.rmtree(parquetFile) - >>> df.saveAsParquetFile(parquetFile) - >>> df2 = sqlCtx.parquetFile(parquetFile) - >>> sorted(df2.collect()) == sorted(df.collect()) - True - """ - self._jdf.saveAsParquetFile(path) - - def registerTempTable(self, name): - """Registers this RDD as a temporary table using the given name. - - The lifetime of this temporary table is tied to the L{SQLContext} - that was used to create this DataFrame. - - >>> df.registerTempTable("people") - >>> df2 = sqlCtx.sql("select * from people") - >>> sorted(df.collect()) == sorted(df2.collect()) - True - """ - self._jdf.registerTempTable(name) - - def registerAsTable(self, name): - """DEPRECATED: use registerTempTable() instead""" - warnings.warn("Use registerTempTable instead of registerAsTable.", DeprecationWarning) - self.registerTempTable(name) - - def insertInto(self, tableName, overwrite=False): - """Inserts the contents of this DataFrame into the specified table. - - Optionally overwriting any existing data. - """ - self._jdf.insertInto(tableName, overwrite) - - def saveAsTable(self, tableName): - """Creates a new table with the contents of this DataFrame.""" - self._jdf.saveAsTable(tableName) - - def schema(self): - """Returns the schema of this DataFrame (represented by - a L{StructType}). - - >>> df.schema() - StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true))) - """ - return _parse_datatype_json_string(self._jdf.schema().json()) - - def printSchema(self): - """Prints out the schema in the tree format. - - >>> df.printSchema() - root - |-- age: integer (nullable = true) - |-- name: string (nullable = true) - - """ - print (self._jdf.schema().treeString()) - - def count(self): - """Return the number of elements in this RDD. - - Unlike the base RDD implementation of count, this implementation - leverages the query optimizer to compute the count on the DataFrame, - which supports features such as filter pushdown. - - >>> df.count() - 2L - """ - return self._jdf.count() - - def collect(self): - """Return a list that contains all of the rows. - - Each object in the list is a Row, the fields can be accessed as - attributes. 
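The Parquet round trip that the saveAsParquetFile doctest exercises can also be written out as a plain script fragment; it assumes the same df/sqlCtx doctest fixtures and a writable temp directory:

    import tempfile, shutil

    path = tempfile.mkdtemp()
    shutil.rmtree(path)             # the output directory must not exist yet
    df.saveAsParquetFile(path)      # the schema is preserved in the Parquet files

    df2 = sqlCtx.parquetFile(path)  # read it back as a DataFrame
    assert sorted(df2.collect()) == sorted(df.collect())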
- - >>> df.collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] - """ - with SCCallSiteSync(self._sc) as css: - bytesInJava = self._jdf.javaToPython().collect().iterator() - tempFile = NamedTemporaryFile(delete=False, dir=self._sc._temp_dir) - tempFile.close() - self._sc._writeToFile(bytesInJava, tempFile.name) - # Read the data into Python and deserialize it: - with open(tempFile.name, 'rb') as tempFile: - rs = list(BatchedSerializer(PickleSerializer()).load_stream(tempFile)) - os.unlink(tempFile.name) - cls = _create_cls(self.schema()) - return [cls(r) for r in rs] - - def limit(self, num): - """Limit the result count to the number specified. - - >>> df.limit(1).collect() - [Row(age=2, name=u'Alice')] - >>> df.limit(0).collect() - [] - """ - jdf = self._jdf.limit(num) - return DataFrame(jdf, self.sql_ctx) - - def take(self, num): - """Take the first num rows of the RDD. - - Each object in the list is a Row, the fields can be accessed as - attributes. - - >>> df.take(2) - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] - """ - return self.limit(num).collect() - - def map(self, f): - """ Return a new RDD by applying a function to each Row, it's a - shorthand for df.rdd.map() - - >>> df.map(lambda p: p.name).collect() - [u'Alice', u'Bob'] - """ - return self.rdd.map(f) - - def mapPartitions(self, f, preservesPartitioning=False): - """ - Return a new RDD by applying a function to each partition. - - >>> rdd = sc.parallelize([1, 2, 3, 4], 4) - >>> def f(iterator): yield 1 - >>> rdd.mapPartitions(f).sum() - 4 - """ - return self.rdd.mapPartitions(f, preservesPartitioning) - - def cache(self): - """ Persist with the default storage level (C{MEMORY_ONLY_SER}). - """ - self.is_cached = True - self._jdf.cache() - return self - - def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): - """ Set the storage level to persist its values across operations - after the first time it is computed. This can only be used to assign - a new storage level if the RDD does not have a storage level set yet. - If no storage level is specified defaults to (C{MEMORY_ONLY_SER}). - """ - self.is_cached = True - javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) - self._jdf.persist(javaStorageLevel) - return self - - def unpersist(self, blocking=True): - """ Mark it as non-persistent, and remove all blocks for it from - memory and disk. - """ - self.is_cached = False - self._jdf.unpersist(blocking) - return self - - # def coalesce(self, numPartitions, shuffle=False): - # rdd = self._jdf.coalesce(numPartitions, shuffle, None) - # return DataFrame(rdd, self.sql_ctx) - - def repartition(self, numPartitions): - """ Return a new :class:`DataFrame` that has exactly `numPartitions` - partitions. - """ - rdd = self._jdf.repartition(numPartitions, None) - return DataFrame(rdd, self.sql_ctx) - - def sample(self, withReplacement, fraction, seed=None): - """ - Return a sampled subset of this DataFrame. - - >>> df = sqlCtx.inferSchema(rdd) - >>> df.sample(False, 0.5, 97).count() - 2L - """ - assert fraction >= 0.0, "Negative fraction value: %s" % fraction - seed = seed if seed is not None else random.randint(0, sys.maxint) - rdd = self._jdf.sample(withReplacement, fraction, long(seed)) - return DataFrame(rdd, self.sql_ctx) - - # def takeSample(self, withReplacement, num, seed=None): - # """Return a fixed-size sampled subset of this DataFrame. 
- # - # >>> df = sqlCtx.inferSchema(rdd) - # >>> df.takeSample(False, 2, 97) - # [Row(field1=3, field2=u'row3'), Row(field1=1, field2=u'row1')] - # """ - # seed = seed if seed is not None else random.randint(0, sys.maxint) - # with SCCallSiteSync(self.context) as css: - # bytesInJava = self._jdf \ - # .takeSampleToPython(withReplacement, num, long(seed)) \ - # .iterator() - # cls = _create_cls(self.schema()) - # return map(cls, self._collect_iterator_through_file(bytesInJava)) - - @property - def dtypes(self): - """Return all column names and their data types as a list. - - >>> df.dtypes - [('age', 'integer'), ('name', 'string')] - """ - return [(str(f.name), f.dataType.jsonValue()) for f in self.schema().fields] - - @property - def columns(self): - """ Return all column names as a list. - - >>> df.columns - [u'age', u'name'] - """ - return [f.name for f in self.schema().fields] - - def join(self, other, joinExprs=None, joinType=None): - """ - Join with another DataFrame, using the given join expression. - The following performs a full outer join between `df1` and `df2`:: - - :param other: Right side of the join - :param joinExprs: Join expression - :param joinType: One of `inner`, `outer`, `left_outer`, `right_outer`, `semijoin`. - - >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).collect() - [Row(name=None, height=80), Row(name=u'Bob', height=85), Row(name=u'Alice', height=None)] - """ - - if joinExprs is None: - jdf = self._jdf.join(other._jdf) - else: - assert isinstance(joinExprs, Column), "joinExprs should be Column" - if joinType is None: - jdf = self._jdf.join(other._jdf, joinExprs._jc) - else: - assert isinstance(joinType, basestring), "joinType should be basestring" - jdf = self._jdf.join(other._jdf, joinExprs._jc, joinType) - return DataFrame(jdf, self.sql_ctx) - - def sort(self, *cols): - """ Return a new :class:`DataFrame` sorted by the specified column. - - :param cols: The columns or expressions used for sorting - - >>> df.sort(df.age.desc()).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] - >>> df.sortBy(df.age.desc()).collect() - [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] - """ - if not cols: - raise ValueError("should sort by at least one column") - jcols = ListConverter().convert([_to_java_column(c) for c in cols], - self._sc._gateway._gateway_client) - jdf = self._jdf.sort(self._sc._jvm.PythonUtils.toSeq(jcols)) - return DataFrame(jdf, self.sql_ctx) - - sortBy = sort - - def head(self, n=None): - """ Return the first `n` rows or the first row if n is None. - - >>> df.head() - Row(age=2, name=u'Alice') - >>> df.head(1) - [Row(age=2, name=u'Alice')] - """ - if n is None: - rs = self.head(1) - return rs[0] if rs else None - return self.take(n) - - def first(self): - """ Return the first row. 
- - >>> df.first() - Row(age=2, name=u'Alice') - """ - return self.head() - - def __getitem__(self, item): - """ Return the column by given name - - >>> df['age'].collect() - [Row(age=2), Row(age=5)] - >>> df[ ["name", "age"]].collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] - >>> df[ df.age > 3 ].collect() - [Row(age=5, name=u'Bob')] - """ - if isinstance(item, basestring): - jc = self._jdf.apply(item) - return Column(jc, self.sql_ctx) - elif isinstance(item, Column): - return self.filter(item) - elif isinstance(item, list): - return self.select(*item) - else: - raise IndexError("unexpected index: %s" % item) - - def __getattr__(self, name): - """ Return the column by given name - - >>> df.age.collect() - [Row(age=2), Row(age=5)] - """ - if name.startswith("__"): - raise AttributeError(name) - jc = self._jdf.apply(name) - return Column(jc, self.sql_ctx) - - def select(self, *cols): - """ Selecting a set of expressions. - - >>> df.select().collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] - >>> df.select('*').collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] - >>> df.select('name', 'age').collect() - [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] - >>> df.select(df.name, (df.age + 10).alias('age')).collect() - [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)] - """ - if not cols: - cols = ["*"] - jcols = ListConverter().convert([_to_java_column(c) for c in cols], - self._sc._gateway._gateway_client) - jdf = self._jdf.select(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols)) - return DataFrame(jdf, self.sql_ctx) - - def selectExpr(self, *expr): - """ - Selects a set of SQL expressions. This is a variant of - `select` that accepts SQL expressions. - - >>> df.selectExpr("age * 2", "abs(age)").collect() - [Row(('age * 2)=4, Abs('age)=2), Row(('age * 2)=10, Abs('age)=5)] - """ - jexpr = ListConverter().convert(expr, self._sc._gateway._gateway_client) - jdf = self._jdf.selectExpr(self._sc._jvm.PythonUtils.toSeq(jexpr)) - return DataFrame(jdf, self.sql_ctx) - - def filter(self, condition): - """ Filtering rows using the given condition, which could be - Column expression or string of SQL expression. - - where() is an alias for filter(). - - >>> df.filter(df.age > 3).collect() - [Row(age=5, name=u'Bob')] - >>> df.where(df.age == 2).collect() - [Row(age=2, name=u'Alice')] - - >>> df.filter("age > 3").collect() - [Row(age=5, name=u'Bob')] - >>> df.where("age = 2").collect() - [Row(age=2, name=u'Alice')] - """ - if isinstance(condition, basestring): - jdf = self._jdf.filter(condition) - elif isinstance(condition, Column): - jdf = self._jdf.filter(condition._jc) - else: - raise TypeError("condition should be string or Column") - return DataFrame(jdf, self.sql_ctx) - - where = filter - - def groupBy(self, *cols): - """ Group the :class:`DataFrame` using the specified columns, - so we can run aggregation on them. See :class:`GroupedData` - for all the available aggregate functions. 
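Because __getattr__, __getitem__ and filter() above offer several equivalent spellings, a small sketch against the df doctest fixture may make the overlap clearer:

    # df.age and df['age'] both resolve to the same Column.
    print df.age.collect()                     # [Row(age=2), Row(age=5)]
    print df['age'].collect()                  # same result

    # Boolean Column expressions and SQL-string conditions filter identically.
    assert df[df.age > 3].collect() == df.filter("age > 3").collect()

    # select() accepts names, Columns, or derived expressions.
    print df.select(df.name, (df.age + 10).alias('age')).collect()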
- - >>> df.groupBy().avg().collect() - [Row(AVG(age#0)=3.5)] - >>> df.groupBy('name').agg({'age': 'mean'}).collect() - [Row(name=u'Bob', AVG(age#0)=5.0), Row(name=u'Alice', AVG(age#0)=2.0)] - >>> df.groupBy(df.name).avg().collect() - [Row(name=u'Bob', AVG(age#0)=5.0), Row(name=u'Alice', AVG(age#0)=2.0)] - """ - jcols = ListConverter().convert([_to_java_column(c) for c in cols], - self._sc._gateway._gateway_client) - jdf = self._jdf.groupBy(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols)) - return GroupedData(jdf, self.sql_ctx) - - def agg(self, *exprs): - """ Aggregate on the entire :class:`DataFrame` without groups - (shorthand for df.groupBy.agg()). - - >>> df.agg({"age": "max"}).collect() - [Row(MAX(age#0)=5)] - >>> from pyspark.sql import Dsl - >>> df.agg(Dsl.min(df.age)).collect() - [Row(MIN(age#0)=2)] - """ - return self.groupBy().agg(*exprs) - - def unionAll(self, other): - """ Return a new DataFrame containing union of rows in this - frame and another frame. - - This is equivalent to `UNION ALL` in SQL. - """ - return DataFrame(self._jdf.unionAll(other._jdf), self.sql_ctx) - - def intersect(self, other): - """ Return a new :class:`DataFrame` containing rows only in - both this frame and another frame. - - This is equivalent to `INTERSECT` in SQL. - """ - return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx) - - def subtract(self, other): - """ Return a new :class:`DataFrame` containing rows in this frame - but not in another frame. - - This is equivalent to `EXCEPT` in SQL. - """ - return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx) - - def addColumn(self, colName, col): - """ Return a new :class:`DataFrame` by adding a column. - - >>> df.addColumn('age2', df.age + 2).collect() - [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)] - """ - return self.select('*', col.alias(colName)) - - def to_pandas(self): - """ - Collect all the rows and return a `pandas.DataFrame`. - - >>> df.to_pandas() # doctest: +SKIP - age name - 0 2 Alice - 1 5 Bob - """ - import pandas as pd - return pd.DataFrame.from_records(self.collect(), columns=self.columns) - - -# Having SchemaRDD for backward compatibility (for docs) -class SchemaRDD(DataFrame): - """ - SchemaRDD is deprecated, please use DataFrame - """ - - -def dfapi(f): - def _api(self): - name = f.__name__ - jdf = getattr(self._jdf, name)() - return DataFrame(jdf, self.sql_ctx) - _api.__name__ = f.__name__ - _api.__doc__ = f.__doc__ - return _api - - -class GroupedData(object): - - """ - A set of methods for aggregations on a :class:`DataFrame`, - created by DataFrame.groupBy(). - """ - - def __init__(self, jdf, sql_ctx): - self._jdf = jdf - self.sql_ctx = sql_ctx - - def agg(self, *exprs): - """ Compute aggregates by specifying a map from column name - to aggregate methods. - - The available aggregate methods are `avg`, `max`, `min`, - `sum`, `count`. - - :param exprs: list or aggregate columns or a map from column - name to aggregate methods. 
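A compact sketch of the two agg() calling conventions described here, using the df fixture and the Dsl helpers defined later in this file:

    from pyspark.sql import Dsl

    # Dict form: column name -> aggregate method name.
    print df.groupBy('name').agg({'age': 'mean'}).collect()

    # Column form: pass aggregate Columns built with Dsl.
    print df.groupBy(df.name).agg(Dsl.min(df.age)).collect()

    # Without grouping columns, the whole frame collapses to a single row,
    # e.g. [Row(MAX(age#0)=5)] as in the df.agg doctest above.
    print df.agg({'age': 'max'}).collect()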
- - >>> gdf = df.groupBy(df.name) - >>> gdf.agg({"age": "max"}).collect() - [Row(name=u'Bob', MAX(age#0)=5), Row(name=u'Alice', MAX(age#0)=2)] - >>> from pyspark.sql import Dsl - >>> gdf.agg(Dsl.min(df.age)).collect() - [Row(MIN(age#0)=5), Row(MIN(age#0)=2)] - """ - assert exprs, "exprs should not be empty" - if len(exprs) == 1 and isinstance(exprs[0], dict): - jmap = MapConverter().convert(exprs[0], - self.sql_ctx._sc._gateway._gateway_client) - jdf = self._jdf.agg(jmap) - else: - # Columns - assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column" - jcols = ListConverter().convert([c._jc for c in exprs[1:]], - self.sql_ctx._sc._gateway._gateway_client) - jdf = self._jdf.agg(exprs[0]._jc, self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols)) - return DataFrame(jdf, self.sql_ctx) - - @dfapi - def count(self): - """ Count the number of rows for each group. - - >>> df.groupBy(df.age).count().collect() - [Row(age=2, count=1), Row(age=5, count=1)] - """ - - @dfapi - def mean(self): - """Compute the average value for each numeric columns - for each group. This is an alias for `avg`.""" - - @dfapi - def avg(self): - """Compute the average value for each numeric columns - for each group.""" - - @dfapi - def max(self): - """Compute the max value for each numeric columns for - each group. """ - - @dfapi - def min(self): - """Compute the min value for each numeric column for - each group.""" - - @dfapi - def sum(self): - """Compute the sum for each numeric columns for each - group.""" - - -def _create_column_from_literal(literal): - sc = SparkContext._active_spark_context - return sc._jvm.Dsl.lit(literal) - - -def _create_column_from_name(name): - sc = SparkContext._active_spark_context - return sc._jvm.Dsl.col(name) - - -def _to_java_column(col): - if isinstance(col, Column): - jcol = col._jc - else: - jcol = _create_column_from_name(col) - return jcol - - -def _unary_op(name, doc="unary operator"): - """ Create a method for given unary operator """ - def _(self): - jc = getattr(self._jc, name)() - return Column(jc, self.sql_ctx) - _.__doc__ = doc - return _ - - -def _dsl_op(name, doc=''): - def _(self): - jc = getattr(self._sc._jvm.Dsl, name)(self._jc) - return Column(jc, self.sql_ctx) - _.__doc__ = doc - return _ - - -def _bin_op(name, doc="binary operator"): - """ Create a method for given binary operator - """ - def _(self, other): - jc = other._jc if isinstance(other, Column) else other - njc = getattr(self._jc, name)(jc) - return Column(njc, self.sql_ctx) - _.__doc__ = doc - return _ - - -def _reverse_op(name, doc="binary operator"): - """ Create a method for binary operator (this object is on right side) - """ - def _(self, other): - jother = _create_column_from_literal(other) - jc = getattr(jother, name)(self._jc) - return Column(jc, self.sql_ctx) - _.__doc__ = doc - return _ - - -class Column(DataFrame): - - """ - A column in a DataFrame. - - `Column` instances can be created by:: - - # 1. Select a column out of a DataFrame - df.colName - df["colName"] - - # 2. 
Create from an expression - df.colName + 1 - 1 / df.colName - """ - - def __init__(self, jc, sql_ctx=None): - self._jc = jc - super(Column, self).__init__(jc, sql_ctx) - - # arithmetic operators - __neg__ = _dsl_op("negate") - __add__ = _bin_op("plus") - __sub__ = _bin_op("minus") - __mul__ = _bin_op("multiply") - __div__ = _bin_op("divide") - __mod__ = _bin_op("mod") - __radd__ = _bin_op("plus") - __rsub__ = _reverse_op("minus") - __rmul__ = _bin_op("multiply") - __rdiv__ = _reverse_op("divide") - __rmod__ = _reverse_op("mod") - - # logistic operators - __eq__ = _bin_op("equalTo") - __ne__ = _bin_op("notEqual") - __lt__ = _bin_op("lt") - __le__ = _bin_op("leq") - __ge__ = _bin_op("geq") - __gt__ = _bin_op("gt") - - # `and`, `or`, `not` cannot be overloaded in Python, - # so use bitwise operators as boolean operators - __and__ = _bin_op('and') - __or__ = _bin_op('or') - __invert__ = _dsl_op('not') - __rand__ = _bin_op("and") - __ror__ = _bin_op("or") - - # container operators - __contains__ = _bin_op("contains") - __getitem__ = _bin_op("getItem") - getField = _bin_op("getField", "An expression that gets a field by name in a StructField.") - - # string methods - rlike = _bin_op("rlike") - like = _bin_op("like") - startswith = _bin_op("startsWith") - endswith = _bin_op("endsWith") - - def substr(self, startPos, length): - """ - Return a Column which is a substring of the column - - :param startPos: start position (int or Column) - :param length: length of the substring (int or Column) - - >>> df.name.substr(1, 3).collect() - [Row(col=u'Ali'), Row(col=u'Bob')] - """ - if type(startPos) != type(length): - raise TypeError("Can not mix the type") - if isinstance(startPos, (int, long)): - jc = self._jc.substr(startPos, length) - elif isinstance(startPos, Column): - jc = self._jc.substr(startPos._jc, length._jc) - else: - raise TypeError("Unexpected type: %s" % type(startPos)) - return Column(jc, self.sql_ctx) - - __getslice__ = substr - - # order - asc = _unary_op("asc") - desc = _unary_op("desc") - - isNull = _unary_op("isNull", "True if the current expression is null.") - isNotNull = _unary_op("isNotNull", "True if the current expression is not null.") - - def alias(self, alias): - """Return a alias for this column - - >>> df.age.alias("age2").collect() - [Row(age2=2), Row(age2=5)] - """ - return Column(getattr(self._jc, "as")(alias), self.sql_ctx) - - def cast(self, dataType): - """ Convert the column into type `dataType` - - >>> df.select(df.age.cast("string").alias('ages')).collect() - [Row(ages=u'2'), Row(ages=u'5')] - >>> df.select(df.age.cast(StringType()).alias('ages')).collect() - [Row(ages=u'2'), Row(ages=u'5')] - """ - if self.sql_ctx is None: - sc = SparkContext._active_spark_context - ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc()) - else: - ssql_ctx = self.sql_ctx._ssql_ctx - if isinstance(dataType, basestring): - jc = self._jc.cast(dataType) - elif isinstance(dataType, DataType): - jdt = ssql_ctx.parseDataType(dataType.json()) - jc = self._jc.cast(jdt) - return Column(jc, self.sql_ctx) - - def to_pandas(self): - """ - Return a pandas.Series from the column - - >>> df.age.to_pandas() # doctest: +SKIP - 0 2 - 1 5 - dtype: int64 - """ - import pandas as pd - data = [c for c, in self.collect()] - return pd.Series(data) - - -def _aggregate_func(name, doc=""): - """ Create a function for aggregator by name""" - def _(col): - sc = SparkContext._active_spark_context - jc = getattr(sc._jvm.Dsl, name)(_to_java_column(col)) - return Column(jc) - _.__name__ = name - _.__doc__ = doc - return 
staticmethod(_) - - -class UserDefinedFunction(object): - def __init__(self, func, returnType): - self.func = func - self.returnType = returnType - self._broadcast = None - self._judf = self._create_judf() - - def _create_judf(self): - f = self.func # put it in closure `func` - func = lambda _, it: imap(lambda x: f(*x), it) - ser = AutoBatchedSerializer(PickleSerializer()) - command = (func, None, ser, ser) - sc = SparkContext._active_spark_context - pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self) - ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc()) - jdt = ssql_ctx.parseDataType(self.returnType.json()) - judf = sc._jvm.UserDefinedPythonFunction(f.__name__, bytearray(pickled_command), env, - includes, sc.pythonExec, broadcast_vars, - sc._javaAccumulator, jdt) - return judf - - def __del__(self): - if self._broadcast is not None: - self._broadcast.unpersist() - self._broadcast = None - - def __call__(self, *cols): - sc = SparkContext._active_spark_context - jcols = ListConverter().convert([_to_java_column(c) for c in cols], - sc._gateway._gateway_client) - jc = self._judf.apply(sc._jvm.PythonUtils.toSeq(jcols)) - return Column(jc) - - -class Dsl(object): - """ - A collections of builtin aggregators - """ - DSLS = { - 'lit': 'Creates a :class:`Column` of literal value.', - 'col': 'Returns a :class:`Column` based on the given column name.', - 'column': 'Returns a :class:`Column` based on the given column name.', - 'upper': 'Converts a string expression to upper case.', - 'lower': 'Converts a string expression to upper case.', - 'sqrt': 'Computes the square root of the specified float value.', - 'abs': 'Computes the absolutle value.', - - 'max': 'Aggregate function: returns the maximum value of the expression in a group.', - 'min': 'Aggregate function: returns the minimum value of the expression in a group.', - 'first': 'Aggregate function: returns the first value in a group.', - 'last': 'Aggregate function: returns the last value in a group.', - 'count': 'Aggregate function: returns the number of items in a group.', - 'sum': 'Aggregate function: returns the sum of all values in the expression.', - 'avg': 'Aggregate function: returns the average of the values in a group.', - 'mean': 'Aggregate function: returns the average of the values in a group.', - 'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.', - } - - for _name, _doc in DSLS.items(): - locals()[_name] = _aggregate_func(_name, _doc) - del _name, _doc - - @staticmethod - def countDistinct(col, *cols): - """ Return a new Column for distinct count of (col, *cols) - - >>> from pyspark.sql import Dsl - >>> df.agg(Dsl.countDistinct(df.age, df.name).alias('c')).collect() - [Row(c=2)] - - >>> df.agg(Dsl.countDistinct("age", "name").alias('c')).collect() - [Row(c=2)] - """ - sc = SparkContext._active_spark_context - jcols = ListConverter().convert([_to_java_column(c) for c in cols], - sc._gateway._gateway_client) - jc = sc._jvm.Dsl.countDistinct(_to_java_column(col), - sc._jvm.PythonUtils.toSeq(jcols)) - return Column(jc) - - @staticmethod - def approxCountDistinct(col, rsd=None): - """ Return a new Column for approxiate distinct count of (col, *cols) - - >>> from pyspark.sql import Dsl - >>> df.agg(Dsl.approxCountDistinct(df.age).alias('c')).collect() - [Row(c=2)] - """ - sc = SparkContext._active_spark_context - if rsd is None: - jc = sc._jvm.Dsl.approxCountDistinct(_to_java_column(col)) - else: - jc = sc._jvm.Dsl.approxCountDistinct(_to_java_column(col), rsd) - 
return Column(jc) - - @staticmethod - def udf(f, returnType=StringType()): - """Create a user defined function (UDF) - - >>> slen = Dsl.udf(lambda s: len(s), IntegerType()) - >>> df.select(slen(df.name).alias('slen')).collect() - [Row(slen=5), Row(slen=3)] - """ - return UserDefinedFunction(f, returnType) - - -def _test(): - import doctest - from pyspark.context import SparkContext - # let doctest run in pyspark.sql, so DataTypes can be picklable - import pyspark.sql - from pyspark.sql import Row, SQLContext - from pyspark.sql_tests import ExamplePoint, ExamplePointUDT - globs = pyspark.sql.__dict__.copy() - sc = SparkContext('local[4]', 'PythonTest') - globs['sc'] = sc - globs['sqlCtx'] = sqlCtx = SQLContext(sc) - globs['rdd'] = sc.parallelize( - [Row(field1=1, field2="row1"), - Row(field1=2, field2="row2"), - Row(field1=3, field2="row3")] - ) - rdd2 = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]) - rdd3 = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]) - globs['df'] = sqlCtx.inferSchema(rdd2) - globs['df2'] = sqlCtx.inferSchema(rdd3) - globs['ExamplePoint'] = ExamplePoint - globs['ExamplePointUDT'] = ExamplePointUDT - jsonStrings = [ - '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', - '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' - '"field6":[{"field7": "row2"}]}', - '{"field1" : null, "field2": "row3", ' - '"field3":{"field4":33, "field5": []}}' - ] - globs['jsonStrings'] = jsonStrings - globs['json'] = sc.parallelize(jsonStrings) - (failure_count, test_count) = doctest.testmod( - pyspark.sql, globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() - if failure_count: - exit(-1) - - -if __name__ == "__main__": - _test() diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py new file mode 100644 index 0000000000000..0a5ba00393aab --- /dev/null +++ b/python/pyspark/sql/__init__.py @@ -0,0 +1,42 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +public classes of Spark SQL: + + - L{SQLContext} + Main entry point for SQL functionality. + - L{DataFrame} + A Resilient Distributed Dataset (RDD) with Schema information for the data contained. In + addition to normal RDD operations, DataFrames also support SQL. + - L{GroupedData} + - L{Column} + Column is a DataFrame with a single column. + - L{Row} + A Row of data returned by a Spark SQL query. + - L{HiveContext} + Main entry point for accessing data stored in Apache Hive.. 
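User-defined functions appear twice in this patch, as Dsl.udf for the DSL and SQLContext.registerFunction for SQL text, so a short side-by-side sketch may help. It reuses the df/sqlCtx fixtures, the function name strLen is purely illustrative, and the IntegerType import follows the pyspark.sql.types module that the new context.py imports:

    from pyspark.sql import Dsl
    from pyspark.sql.types import IntegerType

    # DSL route: wrap a Python callable and use it like any other Column.
    slen = Dsl.udf(lambda s: len(s), IntegerType())
    print df.select(slen(df.name).alias('slen')).collect()   # [Row(slen=5), Row(slen=3)]

    # SQL route: register the callable under a name usable in query text.
    sqlCtx.registerFunction("strLen", lambda s: len(s), IntegerType())
    print sqlCtx.sql("SELECT strLen('test')").collect()      # [Row(c0=4)]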
+""" + +from pyspark.sql.context import SQLContext, HiveContext +from pyspark.sql.types import Row +from pyspark.sql.dataframe import DataFrame, GroupedData, Column, Dsl, SchemaRDD + +__all__ = [ + 'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row', + 'Dsl', +] diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py new file mode 100644 index 0000000000000..49f016a9cf2e9 --- /dev/null +++ b/python/pyspark/sql/context.py @@ -0,0 +1,642 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import warnings +import json +from array import array +from itertools import imap + +from py4j.protocol import Py4JError + +from pyspark.rdd import _prepare_for_python_RDD +from pyspark.serializers import AutoBatchedSerializer, PickleSerializer +from pyspark.sql.types import StringType, StructType, _verify_type, \ + _infer_schema, _has_nulltype, _merge_type, _create_converter, _python_to_sql_converter +from pyspark.sql.dataframe import DataFrame + +__all__ = ["SQLContext", "HiveContext"] + + +class SQLContext(object): + + """Main entry point for Spark SQL functionality. + + A SQLContext can be used create L{DataFrame}, register L{DataFrame} as + tables, execute SQL over tables, cache tables, and read parquet files. + """ + + def __init__(self, sparkContext, sqlContext=None): + """Create a new SQLContext. + + :param sparkContext: The SparkContext to wrap. + :param sqlContext: An optional JVM Scala SQLContext. If set, we do not instatiate a new + SQLContext in the JVM, instead we make all calls to this object. + + >>> df = sqlCtx.inferSchema(rdd) + >>> sqlCtx.inferSchema(df) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + TypeError:... + + >>> bad_rdd = sc.parallelize([1,2,3]) + >>> sqlCtx.inferSchema(bad_rdd) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + ValueError:... + + >>> from datetime import datetime + >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1L, + ... b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1), + ... time=datetime(2014, 8, 1, 14, 1, 5))]) + >>> df = sqlCtx.inferSchema(allTypes) + >>> df.registerTempTable("allTypes") + >>> sqlCtx.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a ' + ... 'from allTypes where b and i > 0').collect() + [Row(c0=2, c1=2.0, c2=False, c3=2, c4=0...8, 1, 14, 1, 5), a=1)] + >>> df.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, + ... x.row.a, x.list)).collect() + [(1, u'string', 1.0, 1, True, ...(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])] + """ + self._sc = sparkContext + self._jsc = self._sc._jsc + self._jvm = self._sc._jvm + self._scala_SQLContext = sqlContext + + @property + def _ssql_ctx(self): + """Accessor for the JVM Spark SQL context. 
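For readers new to the split-out module, the entry-point flow of context.py is: wrap a SparkContext, infer a schema from an RDD of Row objects, and work with the resulting DataFrame. A minimal end-to-end sketch, with an illustrative master and app name:

    from pyspark import SparkContext
    from pyspark.sql import SQLContext, Row

    sc = SparkContext('local[2]', 'context-example')
    sqlCtx = SQLContext(sc)

    rdd = sc.parallelize([Row(field1=1, field2="row1"),
                          Row(field1=2, field2="row2")])
    df = sqlCtx.inferSchema(rdd)    # schema inferred from the first rows
    print df.schema()               # the inferred StructType
    print df.collect()[0]           # Row(field1=1, field2=u'row1')
    sc.stop()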
+ + Subclasses can override this property to provide their own + JVM Contexts. + """ + if self._scala_SQLContext is None: + self._scala_SQLContext = self._jvm.SQLContext(self._jsc.sc()) + return self._scala_SQLContext + + def registerFunction(self, name, f, returnType=StringType()): + """Registers a lambda function as a UDF so it can be used in SQL statements. + + In addition to a name and the function itself, the return type can be optionally specified. + When the return type is not given it default to a string and conversion will automatically + be done. For any other return type, the produced object must match the specified type. + + >>> sqlCtx.registerFunction("stringLengthString", lambda x: len(x)) + >>> sqlCtx.sql("SELECT stringLengthString('test')").collect() + [Row(c0=u'4')] + >>> from pyspark.sql.types import IntegerType + >>> sqlCtx.registerFunction("stringLengthInt", lambda x: len(x), IntegerType()) + >>> sqlCtx.sql("SELECT stringLengthInt('test')").collect() + [Row(c0=4)] + """ + func = lambda _, it: imap(lambda x: f(*x), it) + ser = AutoBatchedSerializer(PickleSerializer()) + command = (func, None, ser, ser) + pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(self._sc, command, self) + self._ssql_ctx.udf().registerPython(name, + bytearray(pickled_cmd), + env, + includes, + self._sc.pythonExec, + bvars, + self._sc._javaAccumulator, + returnType.json()) + + def inferSchema(self, rdd, samplingRatio=None): + """Infer and apply a schema to an RDD of L{Row}. + + When samplingRatio is specified, the schema is inferred by looking + at the types of each row in the sampled dataset. Otherwise, the + first 100 rows of the RDD are inspected. Nested collections are + supported, which can include array, dict, list, Row, tuple, + namedtuple, or object. + + Each row could be L{pyspark.sql.Row} object or namedtuple or objects. + Using top level dicts is deprecated, as dict is used to represent Maps. + + If a single column has multiple distinct inferred types, it may cause + runtime exceptions. + + >>> rdd = sc.parallelize( + ... [Row(field1=1, field2="row1"), + ... Row(field1=2, field2="row2"), + ... Row(field1=3, field2="row3")]) + >>> df = sqlCtx.inferSchema(rdd) + >>> df.collect()[0] + Row(field1=1, field2=u'row1') + + >>> NestedRow = Row("f1", "f2") + >>> nestedRdd1 = sc.parallelize([ + ... NestedRow(array('i', [1, 2]), {"row1": 1.0}), + ... NestedRow(array('i', [2, 3]), {"row2": 2.0})]) + >>> df = sqlCtx.inferSchema(nestedRdd1) + >>> df.collect() + [Row(f1=[1, 2], f2={u'row1': 1.0}), ..., f2={u'row2': 2.0})] + + >>> nestedRdd2 = sc.parallelize([ + ... NestedRow([[1, 2], [2, 3]], [1, 2]), + ... NestedRow([[2, 3], [3, 4]], [2, 3])]) + >>> df = sqlCtx.inferSchema(nestedRdd2) + >>> df.collect() + [Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), ..., f2=[2, 3])] + + >>> from collections import namedtuple + >>> CustomRow = namedtuple('CustomRow', 'field1 field2') + >>> rdd = sc.parallelize( + ... [CustomRow(field1=1, field2="row1"), + ... CustomRow(field1=2, field2="row2"), + ... 
CustomRow(field1=3, field2="row3")]) + >>> df = sqlCtx.inferSchema(rdd) + >>> df.collect()[0] + Row(field1=1, field2=u'row1') + """ + + if isinstance(rdd, DataFrame): + raise TypeError("Cannot apply schema to DataFrame") + + first = rdd.first() + if not first: + raise ValueError("The first row in RDD is empty, " + "can not infer schema") + if type(first) is dict: + warnings.warn("Using RDD of dict to inferSchema is deprecated," + "please use pyspark.sql.Row instead") + + if samplingRatio is None: + schema = _infer_schema(first) + if _has_nulltype(schema): + for row in rdd.take(100)[1:]: + schema = _merge_type(schema, _infer_schema(row)) + if not _has_nulltype(schema): + break + else: + warnings.warn("Some of types cannot be determined by the " + "first 100 rows, please try again with sampling") + else: + if samplingRatio > 0.99: + rdd = rdd.sample(False, float(samplingRatio)) + schema = rdd.map(_infer_schema).reduce(_merge_type) + + converter = _create_converter(schema) + rdd = rdd.map(converter) + return self.applySchema(rdd, schema) + + def applySchema(self, rdd, schema): + """ + Applies the given schema to the given RDD of L{tuple} or L{list}. + + These tuples or lists can contain complex nested structures like + lists, maps or nested rows. + + The schema should be a StructType. + + It is important that the schema matches the types of the objects + in each row or exceptions could be thrown at runtime. + + >>> from pyspark.sql.types import * + >>> rdd2 = sc.parallelize([(1, "row1"), (2, "row2"), (3, "row3")]) + >>> schema = StructType([StructField("field1", IntegerType(), False), + ... StructField("field2", StringType(), False)]) + >>> df = sqlCtx.applySchema(rdd2, schema) + >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> df2 = sqlCtx.sql("SELECT * from table1") + >>> df2.collect() + [Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')] + + >>> from datetime import date, datetime + >>> rdd = sc.parallelize([(127, -128L, -32768, 32767, 2147483647L, 1.0, + ... date(2010, 1, 1), + ... datetime(2010, 1, 1, 1, 1, 1), + ... {"a": 1}, (2,), [1, 2, 3], None)]) + >>> schema = StructType([ + ... StructField("byte1", ByteType(), False), + ... StructField("byte2", ByteType(), False), + ... StructField("short1", ShortType(), False), + ... StructField("short2", ShortType(), False), + ... StructField("int", IntegerType(), False), + ... StructField("float", FloatType(), False), + ... StructField("date", DateType(), False), + ... StructField("time", TimestampType(), False), + ... StructField("map", + ... MapType(StringType(), IntegerType(), False), False), + ... StructField("struct", + ... StructType([StructField("b", ShortType(), False)]), False), + ... StructField("list", ArrayType(ByteType(), False), False), + ... StructField("null", DoubleType(), True)]) + >>> df = sqlCtx.applySchema(rdd, schema) + >>> results = df.map( + ... lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int, x.float, x.date, + ... x.time, x.map["a"], x.struct.b, x.list, x.null)) + >>> results.collect()[0] # doctest: +NORMALIZE_WHITESPACE + (127, -128, -32768, 32767, 2147483647, 1.0, datetime.date(2010, 1, 1), + datetime.datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None) + + >>> df.registerTempTable("table2") + >>> sqlCtx.sql( + ... "SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " + + ... "short1 + 1 AS short1, short2 - 1 AS short2, int - 1 AS int, " + + ... 
"float + 1.5 as float FROM table2").collect() + [Row(byte1=126, byte2=-127, short1=-32767, short2=32766, int=2147483646, float=2.5)] + + >>> from pyspark.sql.types import _parse_schema_abstract, _infer_schema_type + >>> rdd = sc.parallelize([(127, -32768, 1.0, + ... datetime(2010, 1, 1, 1, 1, 1), + ... {"a": 1}, (2,), [1, 2, 3])]) + >>> abstract = "byte short float time map{} struct(b) list[]" + >>> schema = _parse_schema_abstract(abstract) + >>> typedSchema = _infer_schema_type(rdd.first(), schema) + >>> df = sqlCtx.applySchema(rdd, typedSchema) + >>> df.collect() + [Row(byte=127, short=-32768, float=1.0, time=..., list=[1, 2, 3])] + """ + + if isinstance(rdd, DataFrame): + raise TypeError("Cannot apply schema to DataFrame") + + if not isinstance(schema, StructType): + raise TypeError("schema should be StructType") + + # take the first few rows to verify schema + rows = rdd.take(10) + # Row() cannot been deserialized by Pyrolite + if rows and isinstance(rows[0], tuple) and rows[0].__class__.__name__ == 'Row': + rdd = rdd.map(tuple) + rows = rdd.take(10) + + for row in rows: + _verify_type(row, schema) + + # convert python objects to sql data + converter = _python_to_sql_converter(schema) + rdd = rdd.map(converter) + + jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) + df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) + return DataFrame(df, self) + + def registerRDDAsTable(self, rdd, tableName): + """Registers the given RDD as a temporary table in the catalog. + + Temporary tables exist only during the lifetime of this instance of + SQLContext. + + >>> df = sqlCtx.inferSchema(rdd) + >>> sqlCtx.registerRDDAsTable(df, "table1") + """ + if (rdd.__class__ is DataFrame): + df = rdd._jdf + self._ssql_ctx.registerRDDAsTable(df, tableName) + else: + raise ValueError("Can only register DataFrame as table") + + def parquetFile(self, *paths): + """Loads a Parquet file, returning the result as a L{DataFrame}. + + >>> import tempfile, shutil + >>> parquetFile = tempfile.mkdtemp() + >>> shutil.rmtree(parquetFile) + >>> df = sqlCtx.inferSchema(rdd) + >>> df.saveAsParquetFile(parquetFile) + >>> df2 = sqlCtx.parquetFile(parquetFile) + >>> sorted(df.collect()) == sorted(df2.collect()) + True + """ + gateway = self._sc._gateway + jpath = paths[0] + jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths) - 1) + for i in range(1, len(paths)): + jpaths[i] = paths[i] + jdf = self._ssql_ctx.parquetFile(jpath, jpaths) + return DataFrame(jdf, self) + + def jsonFile(self, path, schema=None, samplingRatio=1.0): + """ + Loads a text file storing one JSON object per line as a + L{DataFrame}. + + If the schema is provided, applies the given schema to this + JSON dataset. + + Otherwise, it samples the dataset with ratio `samplingRatio` to + determine the schema. + + >>> import tempfile, shutil + >>> jsonFile = tempfile.mkdtemp() + >>> shutil.rmtree(jsonFile) + >>> ofn = open(jsonFile, 'w') + >>> for json in jsonStrings: + ... print>>ofn, json + >>> ofn.close() + >>> df1 = sqlCtx.jsonFile(jsonFile) + >>> sqlCtx.registerRDDAsTable(df1, "table1") + >>> df2 = sqlCtx.sql( + ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " + ... "field6 as f4 from table1") + >>> for r in df2.collect(): + ... 
print r + Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) + Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) + Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) + + >>> df3 = sqlCtx.jsonFile(jsonFile, df1.schema()) + >>> sqlCtx.registerRDDAsTable(df3, "table2") + >>> df4 = sqlCtx.sql( + ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " + ... "field6 as f4 from table2") + >>> for r in df4.collect(): + ... print r + Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) + Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) + Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) + + >>> from pyspark.sql.types import * + >>> schema = StructType([ + ... StructField("field2", StringType(), True), + ... StructField("field3", + ... StructType([ + ... StructField("field5", + ... ArrayType(IntegerType(), False), True)]), False)]) + >>> df5 = sqlCtx.jsonFile(jsonFile, schema) + >>> sqlCtx.registerRDDAsTable(df5, "table3") + >>> df6 = sqlCtx.sql( + ... "SELECT field2 AS f1, field3.field5 as f2, " + ... "field3.field5[0] as f3 from table3") + >>> df6.collect() + [Row(f1=u'row1', f2=None, f3=None)...Row(f1=u'row3', f2=[], f3=None)] + """ + if schema is None: + df = self._ssql_ctx.jsonFile(path, samplingRatio) + else: + scala_datatype = self._ssql_ctx.parseDataType(schema.json()) + df = self._ssql_ctx.jsonFile(path, scala_datatype) + return DataFrame(df, self) + + def jsonRDD(self, rdd, schema=None, samplingRatio=1.0): + """Loads an RDD storing one JSON object per string as a L{DataFrame}. + + If the schema is provided, applies the given schema to this + JSON dataset. + + Otherwise, it samples the dataset with ratio `samplingRatio` to + determine the schema. + + >>> df1 = sqlCtx.jsonRDD(json) + >>> sqlCtx.registerRDDAsTable(df1, "table1") + >>> df2 = sqlCtx.sql( + ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " + ... "field6 as f4 from table1") + >>> for r in df2.collect(): + ... print r + Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) + Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) + Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) + + >>> df3 = sqlCtx.jsonRDD(json, df1.schema()) + >>> sqlCtx.registerRDDAsTable(df3, "table2") + >>> df4 = sqlCtx.sql( + ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " + ... "field6 as f4 from table2") + >>> for r in df4.collect(): + ... print r + Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) + Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) + Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) + + >>> from pyspark.sql.types import * + >>> schema = StructType([ + ... StructField("field2", StringType(), True), + ... StructField("field3", + ... StructType([ + ... StructField("field5", + ... ArrayType(IntegerType(), False), True)]), False)]) + >>> df5 = sqlCtx.jsonRDD(json, schema) + >>> sqlCtx.registerRDDAsTable(df5, "table3") + >>> df6 = sqlCtx.sql( + ... "SELECT field2 AS f1, field3.field5 as f2, " + ... "field3.field5[0] as f3 from table3") + >>> df6.collect() + [Row(f1=u'row1', f2=None,...Row(f1=u'row3', f2=[], f3=None)] + + >>> sqlCtx.jsonRDD(sc.parallelize(['{}', + ... '{"key0": {"key1": "value1"}}'])).collect() + [Row(key0=None), Row(key0=Row(key1=u'value1'))] + >>> sqlCtx.jsonRDD(sc.parallelize(['{"key0": null}', + ... 
'{"key0": {"key1": "value1"}}'])).collect() + [Row(key0=None), Row(key0=Row(key1=u'value1'))] + """ + + def func(iterator): + for x in iterator: + if not isinstance(x, basestring): + x = unicode(x) + if isinstance(x, unicode): + x = x.encode("utf-8") + yield x + keyed = rdd.mapPartitions(func) + keyed._bypass_serializer = True + jrdd = keyed._jrdd.map(self._jvm.BytesToString()) + if schema is None: + df = self._ssql_ctx.jsonRDD(jrdd.rdd(), samplingRatio) + else: + scala_datatype = self._ssql_ctx.parseDataType(schema.json()) + df = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) + return DataFrame(df, self) + + def sql(self, sqlQuery): + """Return a L{DataFrame} representing the result of the given query. + + >>> df = sqlCtx.inferSchema(rdd) + >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> df2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1") + >>> df2.collect() + [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] + """ + return DataFrame(self._ssql_ctx.sql(sqlQuery), self) + + def table(self, tableName): + """Returns the specified table as a L{DataFrame}. + + >>> df = sqlCtx.inferSchema(rdd) + >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> df2 = sqlCtx.table("table1") + >>> sorted(df.collect()) == sorted(df2.collect()) + True + """ + return DataFrame(self._ssql_ctx.table(tableName), self) + + def cacheTable(self, tableName): + """Caches the specified table in-memory.""" + self._ssql_ctx.cacheTable(tableName) + + def uncacheTable(self, tableName): + """Removes the specified table from the in-memory cache.""" + self._ssql_ctx.uncacheTable(tableName) + + +class HiveContext(SQLContext): + + """A variant of Spark SQL that integrates with data stored in Hive. + + Configuration for Hive is read from hive-site.xml on the classpath. + It supports running both SQL and HiveQL commands. + """ + + def __init__(self, sparkContext, hiveContext=None): + """Create a new HiveContext. + + :param sparkContext: The SparkContext to wrap. + :param hiveContext: An optional JVM Scala HiveContext. If set, we do not instatiate a new + HiveContext in the JVM, instead we make all calls to this object. + """ + SQLContext.__init__(self, sparkContext) + + if hiveContext: + self._scala_HiveContext = hiveContext + + @property + def _ssql_ctx(self): + try: + if not hasattr(self, '_scala_HiveContext'): + self._scala_HiveContext = self._get_hive_ctx() + return self._scala_HiveContext + except Py4JError as e: + raise Exception("You must build Spark with Hive. " + "Export 'SPARK_HIVE=true' and run " + "build/sbt assembly", e) + + def _get_hive_ctx(self): + return self._jvm.HiveContext(self._jsc.sc()) + + +def _create_row(fields, values): + row = Row(*values) + row.__FIELDS__ = fields + return row + + +class Row(tuple): + + """ + A row in L{DataFrame}. The fields in it can be accessed like attributes. + + Row can be used to create a row object by using named arguments, + the fields will be sorted by names. 
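The sql(), table() and cacheTable() methods above combine into a simple register-and-query loop; written out against the rdd/sqlCtx fixtures that this module's _test() provides:

    df = sqlCtx.inferSchema(rdd)
    sqlCtx.registerRDDAsTable(df, "table1")

    # sql() returns a new DataFrame over the registered data.
    print sqlCtx.sql("SELECT field1 AS f1, field2 AS f2 FROM table1").collect()

    # table() resolves the same name back to a DataFrame, and
    # cacheTable()/uncacheTable() toggle its in-memory caching.
    assert sorted(sqlCtx.table("table1").collect()) == sorted(df.collect())
    sqlCtx.cacheTable("table1")
    sqlCtx.uncacheTable("table1")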
+ + >>> row = Row(name="Alice", age=11) + >>> row + Row(age=11, name='Alice') + >>> row.name, row.age + ('Alice', 11) + + Row also can be used to create another Row like class, then it + could be used to create Row objects, such as + + >>> Person = Row("name", "age") + >>> Person + + >>> Person("Alice", 11) + Row(name='Alice', age=11) + """ + + def __new__(self, *args, **kwargs): + if args and kwargs: + raise ValueError("Can not use both args " + "and kwargs to create Row") + if args: + # create row class or objects + return tuple.__new__(self, args) + + elif kwargs: + # create row objects + names = sorted(kwargs.keys()) + values = tuple(kwargs[n] for n in names) + row = tuple.__new__(self, values) + row.__FIELDS__ = names + return row + + else: + raise ValueError("No args or kwargs") + + def asDict(self): + """ + Return as an dict + """ + if not hasattr(self, "__FIELDS__"): + raise TypeError("Cannot convert a Row class into dict") + return dict(zip(self.__FIELDS__, self)) + + # let obect acs like class + def __call__(self, *args): + """create new Row object""" + return _create_row(self, args) + + def __getattr__(self, item): + if item.startswith("__"): + raise AttributeError(item) + try: + # it will be slow when it has many fields, + # but this will not be used in normal cases + idx = self.__FIELDS__.index(item) + return self[idx] + except IndexError: + raise AttributeError(item) + + def __reduce__(self): + if hasattr(self, "__FIELDS__"): + return (_create_row, (self.__FIELDS__, tuple(self))) + else: + return tuple.__reduce__(self) + + def __repr__(self): + if hasattr(self, "__FIELDS__"): + return "Row(%s)" % ", ".join("%s=%r" % (k, v) + for k, v in zip(self.__FIELDS__, self)) + else: + return "" % ", ".join(self) + + +def _test(): + import doctest + from pyspark.context import SparkContext + from pyspark.sql import Row, SQLContext + import pyspark.sql.context + globs = pyspark.sql.context.__dict__.copy() + sc = SparkContext('local[4]', 'PythonTest') + globs['sc'] = sc + globs['sqlCtx'] = sqlCtx = SQLContext(sc) + globs['rdd'] = sc.parallelize( + [Row(field1=1, field2="row1"), + Row(field1=2, field2="row2"), + Row(field1=3, field2="row3")] + ) + jsonStrings = [ + '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', + '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' + '"field6":[{"field7": "row2"}]}', + '{"field1" : null, "field2": "row3", ' + '"field3":{"field4":33, "field5": []}}' + ] + globs['jsonStrings'] = jsonStrings + globs['json'] = sc.parallelize(jsonStrings) + (failure_count, test_count) = doctest.testmod( + pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS) + globs['sc'].stop() + if failure_count: + exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py new file mode 100644 index 0000000000000..cda704eea75f5 --- /dev/null +++ b/python/pyspark/sql/dataframe.py @@ -0,0 +1,974 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import itertools +import warnings +import random +import os +from tempfile import NamedTemporaryFile +from itertools import imap + +from py4j.java_collections import ListConverter, MapConverter + +from pyspark.context import SparkContext +from pyspark.rdd import RDD, _prepare_for_python_RDD +from pyspark.serializers import BatchedSerializer, AutoBatchedSerializer, PickleSerializer, \ + UTF8Deserializer +from pyspark.storagelevel import StorageLevel +from pyspark.traceback_utils import SCCallSiteSync +from pyspark.sql.types import * +from pyspark.sql.types import _create_cls, _parse_datatype_json_string + + +__all__ = ["DataFrame", "GroupedData", "Column", "Dsl", "SchemaRDD"] + + +class DataFrame(object): + + """A collection of rows that have the same columns. + + A :class:`DataFrame` is equivalent to a relational table in Spark SQL, + and can be created using various functions in :class:`SQLContext`:: + + people = sqlContext.parquetFile("...") + + Once created, it can be manipulated using the various domain-specific-language + (DSL) functions defined in: :class:`DataFrame`, :class:`Column`. + + To select a column from the data frame, use the apply method:: + + ageCol = people.age + + Note that the :class:`Column` type can also be manipulated + through its various functions:: + + # The following creates a new column that increases everybody's age by 10. + people.age + 10 + + + A more concrete example:: + + # To create DataFrame using SQLContext + people = sqlContext.parquetFile("...") + department = sqlContext.parquetFile("...") + + people.filter(people.age > 30).join(department, people.deptId == department.id)) \ + .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"}) + """ + + def __init__(self, jdf, sql_ctx): + self._jdf = jdf + self.sql_ctx = sql_ctx + self._sc = sql_ctx and sql_ctx._sc + self.is_cached = False + + @property + def rdd(self): + """ + Return the content of the :class:`DataFrame` as an :class:`RDD` + of :class:`Row` s. + """ + if not hasattr(self, '_lazy_rdd'): + jrdd = self._jdf.javaToPython() + rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer())) + schema = self.schema() + + def applySchema(it): + cls = _create_cls(schema) + return itertools.imap(cls, it) + + self._lazy_rdd = rdd.mapPartitions(applySchema) + + return self._lazy_rdd + + def toJSON(self, use_unicode=False): + """Convert a DataFrame into a MappedRDD of JSON documents; one document per row. + + >>> df.toJSON().first() + '{"age":2,"name":"Alice"}' + """ + rdd = self._jdf.toJSON() + return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode)) + + def saveAsParquetFile(self, path): + """Save the contents as a Parquet file, preserving the schema. + + Files that are written out using this method can be read back in as + a DataFrame using the L{SQLContext.parquetFile} method. 
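toJSON() above is the inverse of jsonRDD/jsonFile: it serializes each row back into a JSON document. A two-line sketch against the df doctest fixture:

    # One JSON string per row, returned as a plain RDD of strings.
    print df.toJSON().first()       # '{"age":2,"name":"Alice"}'
    print df.toJSON().collect()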
+ + >>> import tempfile, shutil + >>> parquetFile = tempfile.mkdtemp() + >>> shutil.rmtree(parquetFile) + >>> df.saveAsParquetFile(parquetFile) + >>> df2 = sqlCtx.parquetFile(parquetFile) + >>> sorted(df2.collect()) == sorted(df.collect()) + True + """ + self._jdf.saveAsParquetFile(path) + + def registerTempTable(self, name): + """Registers this RDD as a temporary table using the given name. + + The lifetime of this temporary table is tied to the L{SQLContext} + that was used to create this DataFrame. + + >>> df.registerTempTable("people") + >>> df2 = sqlCtx.sql("select * from people") + >>> sorted(df.collect()) == sorted(df2.collect()) + True + """ + self._jdf.registerTempTable(name) + + def registerAsTable(self, name): + """DEPRECATED: use registerTempTable() instead""" + warnings.warn("Use registerTempTable instead of registerAsTable.", DeprecationWarning) + self.registerTempTable(name) + + def insertInto(self, tableName, overwrite=False): + """Inserts the contents of this DataFrame into the specified table. + + Optionally overwriting any existing data. + """ + self._jdf.insertInto(tableName, overwrite) + + def saveAsTable(self, tableName): + """Creates a new table with the contents of this DataFrame.""" + self._jdf.saveAsTable(tableName) + + def schema(self): + """Returns the schema of this DataFrame (represented by + a L{StructType}). + + >>> df.schema() + StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true))) + """ + return _parse_datatype_json_string(self._jdf.schema().json()) + + def printSchema(self): + """Prints out the schema in the tree format. + + >>> df.printSchema() + root + |-- age: integer (nullable = true) + |-- name: string (nullable = true) + + """ + print (self._jdf.schema().treeString()) + + def count(self): + """Return the number of elements in this RDD. + + Unlike the base RDD implementation of count, this implementation + leverages the query optimizer to compute the count on the DataFrame, + which supports features such as filter pushdown. + + >>> df.count() + 2L + """ + return self._jdf.count() + + def collect(self): + """Return a list that contains all of the rows. + + Each object in the list is a Row, the fields can be accessed as + attributes. + + >>> df.collect() + [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + """ + with SCCallSiteSync(self._sc) as css: + bytesInJava = self._jdf.javaToPython().collect().iterator() + tempFile = NamedTemporaryFile(delete=False, dir=self._sc._temp_dir) + tempFile.close() + self._sc._writeToFile(bytesInJava, tempFile.name) + # Read the data into Python and deserialize it: + with open(tempFile.name, 'rb') as tempFile: + rs = list(BatchedSerializer(PickleSerializer()).load_stream(tempFile)) + os.unlink(tempFile.name) + cls = _create_cls(self.schema()) + return [cls(r) for r in rs] + + def limit(self, num): + """Limit the result count to the number specified. + + >>> df.limit(1).collect() + [Row(age=2, name=u'Alice')] + >>> df.limit(0).collect() + [] + """ + jdf = self._jdf.limit(num) + return DataFrame(jdf, self.sql_ctx) + + def take(self, num): + """Take the first num rows of the RDD. + + Each object in the list is a Row, the fields can be accessed as + attributes. 
+ + >>> df.take(2) + [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + """ + return self.limit(num).collect() + + def map(self, f): + """ Return a new RDD by applying a function to each Row, it's a + shorthand for df.rdd.map() + + >>> df.map(lambda p: p.name).collect() + [u'Alice', u'Bob'] + """ + return self.rdd.map(f) + + def mapPartitions(self, f, preservesPartitioning=False): + """ + Return a new RDD by applying a function to each partition. + + >>> rdd = sc.parallelize([1, 2, 3, 4], 4) + >>> def f(iterator): yield 1 + >>> rdd.mapPartitions(f).sum() + 4 + """ + return self.rdd.mapPartitions(f, preservesPartitioning) + + def cache(self): + """ Persist with the default storage level (C{MEMORY_ONLY_SER}). + """ + self.is_cached = True + self._jdf.cache() + return self + + def persist(self, storageLevel=StorageLevel.MEMORY_ONLY_SER): + """ Set the storage level to persist its values across operations + after the first time it is computed. This can only be used to assign + a new storage level if the RDD does not have a storage level set yet. + If no storage level is specified defaults to (C{MEMORY_ONLY_SER}). + """ + self.is_cached = True + javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) + self._jdf.persist(javaStorageLevel) + return self + + def unpersist(self, blocking=True): + """ Mark it as non-persistent, and remove all blocks for it from + memory and disk. + """ + self.is_cached = False + self._jdf.unpersist(blocking) + return self + + # def coalesce(self, numPartitions, shuffle=False): + # rdd = self._jdf.coalesce(numPartitions, shuffle, None) + # return DataFrame(rdd, self.sql_ctx) + + def repartition(self, numPartitions): + """ Return a new :class:`DataFrame` that has exactly `numPartitions` + partitions. + """ + rdd = self._jdf.repartition(numPartitions, None) + return DataFrame(rdd, self.sql_ctx) + + def sample(self, withReplacement, fraction, seed=None): + """ + Return a sampled subset of this DataFrame. + + >>> df.sample(False, 0.5, 97).count() + 1L + """ + assert fraction >= 0.0, "Negative fraction value: %s" % fraction + seed = seed if seed is not None else random.randint(0, sys.maxint) + rdd = self._jdf.sample(withReplacement, fraction, long(seed)) + return DataFrame(rdd, self.sql_ctx) + + # def takeSample(self, withReplacement, num, seed=None): + # """Return a fixed-size sampled subset of this DataFrame. + # + # >>> df = sqlCtx.inferSchema(rdd) + # >>> df.takeSample(False, 2, 97) + # [Row(field1=3, field2=u'row3'), Row(field1=1, field2=u'row1')] + # """ + # seed = seed if seed is not None else random.randint(0, sys.maxint) + # with SCCallSiteSync(self.context) as css: + # bytesInJava = self._jdf \ + # .takeSampleToPython(withReplacement, num, long(seed)) \ + # .iterator() + # cls = _create_cls(self.schema()) + # return map(cls, self._collect_iterator_through_file(bytesInJava)) + + @property + def dtypes(self): + """Return all column names and their data types as a list. + + >>> df.dtypes + [('age', 'integer'), ('name', 'string')] + """ + return [(str(f.name), f.dataType.jsonValue()) for f in self.schema().fields] + + @property + def columns(self): + """ Return all column names as a list. + + >>> df.columns + [u'age', u'name'] + """ + return [f.name for f in self.schema().fields] + + def join(self, other, joinExprs=None, joinType=None): + """ + Join with another DataFrame, using the given join expression. 
+ The following performs a full outer join between `df1` and `df2`:: + + :param other: Right side of the join + :param joinExprs: Join expression + :param joinType: One of `inner`, `outer`, `left_outer`, `right_outer`, `semijoin`. + + >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).collect() + [Row(name=None, height=80), Row(name=u'Bob', height=85), Row(name=u'Alice', height=None)] + """ + + if joinExprs is None: + jdf = self._jdf.join(other._jdf) + else: + assert isinstance(joinExprs, Column), "joinExprs should be Column" + if joinType is None: + jdf = self._jdf.join(other._jdf, joinExprs._jc) + else: + assert isinstance(joinType, basestring), "joinType should be basestring" + jdf = self._jdf.join(other._jdf, joinExprs._jc, joinType) + return DataFrame(jdf, self.sql_ctx) + + def sort(self, *cols): + """ Return a new :class:`DataFrame` sorted by the specified column. + + :param cols: The columns or expressions used for sorting + + >>> df.sort(df.age.desc()).collect() + [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + >>> df.sortBy(df.age.desc()).collect() + [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + """ + if not cols: + raise ValueError("should sort by at least one column") + jcols = ListConverter().convert([_to_java_column(c) for c in cols], + self._sc._gateway._gateway_client) + jdf = self._jdf.sort(self._sc._jvm.PythonUtils.toSeq(jcols)) + return DataFrame(jdf, self.sql_ctx) + + sortBy = sort + + def head(self, n=None): + """ Return the first `n` rows or the first row if n is None. + + >>> df.head() + Row(age=2, name=u'Alice') + >>> df.head(1) + [Row(age=2, name=u'Alice')] + """ + if n is None: + rs = self.head(1) + return rs[0] if rs else None + return self.take(n) + + def first(self): + """ Return the first row. + + >>> df.first() + Row(age=2, name=u'Alice') + """ + return self.head() + + def __getitem__(self, item): + """ Return the column by given name + + >>> df['age'].collect() + [Row(age=2), Row(age=5)] + >>> df[ ["name", "age"]].collect() + [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] + >>> df[ df.age > 3 ].collect() + [Row(age=5, name=u'Bob')] + """ + if isinstance(item, basestring): + jc = self._jdf.apply(item) + return Column(jc, self.sql_ctx) + elif isinstance(item, Column): + return self.filter(item) + elif isinstance(item, list): + return self.select(*item) + else: + raise IndexError("unexpected index: %s" % item) + + def __getattr__(self, name): + """ Return the column by given name + + >>> df.age.collect() + [Row(age=2), Row(age=5)] + """ + if name.startswith("__"): + raise AttributeError(name) + jc = self._jdf.apply(name) + return Column(jc, self.sql_ctx) + + def select(self, *cols): + """ Selecting a set of expressions. + + >>> df.select().collect() + [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + >>> df.select('*').collect() + [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + >>> df.select('name', 'age').collect() + [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] + >>> df.select(df.name, (df.age + 10).alias('age')).collect() + [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)] + """ + if not cols: + cols = ["*"] + jcols = ListConverter().convert([_to_java_column(c) for c in cols], + self._sc._gateway._gateway_client) + jdf = self._jdf.select(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols)) + return DataFrame(jdf, self.sql_ctx) + + def selectExpr(self, *expr): + """ + Selects a set of SQL expressions. This is a variant of + `select` that accepts SQL expressions. 
+ + >>> df.selectExpr("age * 2", "abs(age)").collect() + [Row(('age * 2)=4, Abs('age)=2), Row(('age * 2)=10, Abs('age)=5)] + """ + jexpr = ListConverter().convert(expr, self._sc._gateway._gateway_client) + jdf = self._jdf.selectExpr(self._sc._jvm.PythonUtils.toSeq(jexpr)) + return DataFrame(jdf, self.sql_ctx) + + def filter(self, condition): + """ Filtering rows using the given condition, which could be + Column expression or string of SQL expression. + + where() is an alias for filter(). + + >>> df.filter(df.age > 3).collect() + [Row(age=5, name=u'Bob')] + >>> df.where(df.age == 2).collect() + [Row(age=2, name=u'Alice')] + + >>> df.filter("age > 3").collect() + [Row(age=5, name=u'Bob')] + >>> df.where("age = 2").collect() + [Row(age=2, name=u'Alice')] + """ + if isinstance(condition, basestring): + jdf = self._jdf.filter(condition) + elif isinstance(condition, Column): + jdf = self._jdf.filter(condition._jc) + else: + raise TypeError("condition should be string or Column") + return DataFrame(jdf, self.sql_ctx) + + where = filter + + def groupBy(self, *cols): + """ Group the :class:`DataFrame` using the specified columns, + so we can run aggregation on them. See :class:`GroupedData` + for all the available aggregate functions. + + >>> df.groupBy().avg().collect() + [Row(AVG(age#0)=3.5)] + >>> df.groupBy('name').agg({'age': 'mean'}).collect() + [Row(name=u'Bob', AVG(age#0)=5.0), Row(name=u'Alice', AVG(age#0)=2.0)] + >>> df.groupBy(df.name).avg().collect() + [Row(name=u'Bob', AVG(age#0)=5.0), Row(name=u'Alice', AVG(age#0)=2.0)] + """ + jcols = ListConverter().convert([_to_java_column(c) for c in cols], + self._sc._gateway._gateway_client) + jdf = self._jdf.groupBy(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols)) + return GroupedData(jdf, self.sql_ctx) + + def agg(self, *exprs): + """ Aggregate on the entire :class:`DataFrame` without groups + (shorthand for df.groupBy.agg()). + + >>> df.agg({"age": "max"}).collect() + [Row(MAX(age#0)=5)] + >>> from pyspark.sql import Dsl + >>> df.agg(Dsl.min(df.age)).collect() + [Row(MIN(age#0)=2)] + """ + return self.groupBy().agg(*exprs) + + def unionAll(self, other): + """ Return a new DataFrame containing union of rows in this + frame and another frame. + + This is equivalent to `UNION ALL` in SQL. + """ + return DataFrame(self._jdf.unionAll(other._jdf), self.sql_ctx) + + def intersect(self, other): + """ Return a new :class:`DataFrame` containing rows only in + both this frame and another frame. + + This is equivalent to `INTERSECT` in SQL. + """ + return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx) + + def subtract(self, other): + """ Return a new :class:`DataFrame` containing rows in this frame + but not in another frame. + + This is equivalent to `EXCEPT` in SQL. + """ + return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx) + + def addColumn(self, colName, col): + """ Return a new :class:`DataFrame` by adding a column. + + >>> df.addColumn('age2', df.age + 2).collect() + [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)] + """ + return self.select('*', col.alias(colName)) + + def to_pandas(self): + """ + Collect all the rows and return a `pandas.DataFrame`. 
+ + >>> df.to_pandas() # doctest: +SKIP + age name + 0 2 Alice + 1 5 Bob + """ + import pandas as pd + return pd.DataFrame.from_records(self.collect(), columns=self.columns) + + +# Having SchemaRDD for backward compatibility (for docs) +class SchemaRDD(DataFrame): + """ + SchemaRDD is deprecated, please use DataFrame + """ + + +def dfapi(f): + def _api(self): + name = f.__name__ + jdf = getattr(self._jdf, name)() + return DataFrame(jdf, self.sql_ctx) + _api.__name__ = f.__name__ + _api.__doc__ = f.__doc__ + return _api + + +class GroupedData(object): + + """ + A set of methods for aggregations on a :class:`DataFrame`, + created by DataFrame.groupBy(). + """ + + def __init__(self, jdf, sql_ctx): + self._jdf = jdf + self.sql_ctx = sql_ctx + + def agg(self, *exprs): + """ Compute aggregates by specifying a map from column name + to aggregate methods. + + The available aggregate methods are `avg`, `max`, `min`, + `sum`, `count`. + + :param exprs: list or aggregate columns or a map from column + name to aggregate methods. + + >>> gdf = df.groupBy(df.name) + >>> gdf.agg({"age": "max"}).collect() + [Row(name=u'Bob', MAX(age#0)=5), Row(name=u'Alice', MAX(age#0)=2)] + >>> from pyspark.sql import Dsl + >>> gdf.agg(Dsl.min(df.age)).collect() + [Row(MIN(age#0)=5), Row(MIN(age#0)=2)] + """ + assert exprs, "exprs should not be empty" + if len(exprs) == 1 and isinstance(exprs[0], dict): + jmap = MapConverter().convert(exprs[0], + self.sql_ctx._sc._gateway._gateway_client) + jdf = self._jdf.agg(jmap) + else: + # Columns + assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column" + jcols = ListConverter().convert([c._jc for c in exprs[1:]], + self.sql_ctx._sc._gateway._gateway_client) + jdf = self._jdf.agg(exprs[0]._jc, self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols)) + return DataFrame(jdf, self.sql_ctx) + + @dfapi + def count(self): + """ Count the number of rows for each group. + + >>> df.groupBy(df.age).count().collect() + [Row(age=2, count=1), Row(age=5, count=1)] + """ + + @dfapi + def mean(self): + """Compute the average value for each numeric columns + for each group. This is an alias for `avg`.""" + + @dfapi + def avg(self): + """Compute the average value for each numeric columns + for each group.""" + + @dfapi + def max(self): + """Compute the max value for each numeric columns for + each group. 
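# A minimal usage sketch, assuming the same `df` as in these doctests (an
# integer `age` column and a string `name` column): the dfapi-generated
# aggregate shortcuts such as count/mean/avg/max take no arguments and are
# called directly on the GroupedData returned by groupBy().
df.groupBy(df.name).max().collect()   # max of each numeric column, per name
df.groupBy().avg().collect()          # average over the whole frame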
""" + + @dfapi + def min(self): + """Compute the min value for each numeric column for + each group.""" + + @dfapi + def sum(self): + """Compute the sum for each numeric columns for each + group.""" + + +def _create_column_from_literal(literal): + sc = SparkContext._active_spark_context + return sc._jvm.Dsl.lit(literal) + + +def _create_column_from_name(name): + sc = SparkContext._active_spark_context + return sc._jvm.Dsl.col(name) + + +def _to_java_column(col): + if isinstance(col, Column): + jcol = col._jc + else: + jcol = _create_column_from_name(col) + return jcol + + +def _unary_op(name, doc="unary operator"): + """ Create a method for given unary operator """ + def _(self): + jc = getattr(self._jc, name)() + return Column(jc, self.sql_ctx) + _.__doc__ = doc + return _ + + +def _dsl_op(name, doc=''): + def _(self): + jc = getattr(self._sc._jvm.Dsl, name)(self._jc) + return Column(jc, self.sql_ctx) + _.__doc__ = doc + return _ + + +def _bin_op(name, doc="binary operator"): + """ Create a method for given binary operator + """ + def _(self, other): + jc = other._jc if isinstance(other, Column) else other + njc = getattr(self._jc, name)(jc) + return Column(njc, self.sql_ctx) + _.__doc__ = doc + return _ + + +def _reverse_op(name, doc="binary operator"): + """ Create a method for binary operator (this object is on right side) + """ + def _(self, other): + jother = _create_column_from_literal(other) + jc = getattr(jother, name)(self._jc) + return Column(jc, self.sql_ctx) + _.__doc__ = doc + return _ + + +class Column(DataFrame): + + """ + A column in a DataFrame. + + `Column` instances can be created by:: + + # 1. Select a column out of a DataFrame + df.colName + df["colName"] + + # 2. Create from an expression + df.colName + 1 + 1 / df.colName + """ + + def __init__(self, jc, sql_ctx=None): + self._jc = jc + super(Column, self).__init__(jc, sql_ctx) + + # arithmetic operators + __neg__ = _dsl_op("negate") + __add__ = _bin_op("plus") + __sub__ = _bin_op("minus") + __mul__ = _bin_op("multiply") + __div__ = _bin_op("divide") + __mod__ = _bin_op("mod") + __radd__ = _bin_op("plus") + __rsub__ = _reverse_op("minus") + __rmul__ = _bin_op("multiply") + __rdiv__ = _reverse_op("divide") + __rmod__ = _reverse_op("mod") + + # logistic operators + __eq__ = _bin_op("equalTo") + __ne__ = _bin_op("notEqual") + __lt__ = _bin_op("lt") + __le__ = _bin_op("leq") + __ge__ = _bin_op("geq") + __gt__ = _bin_op("gt") + + # `and`, `or`, `not` cannot be overloaded in Python, + # so use bitwise operators as boolean operators + __and__ = _bin_op('and') + __or__ = _bin_op('or') + __invert__ = _dsl_op('not') + __rand__ = _bin_op("and") + __ror__ = _bin_op("or") + + # container operators + __contains__ = _bin_op("contains") + __getitem__ = _bin_op("getItem") + getField = _bin_op("getField", "An expression that gets a field by name in a StructField.") + + # string methods + rlike = _bin_op("rlike") + like = _bin_op("like") + startswith = _bin_op("startsWith") + endswith = _bin_op("endsWith") + + def substr(self, startPos, length): + """ + Return a Column which is a substring of the column + + :param startPos: start position (int or Column) + :param length: length of the substring (int or Column) + + >>> df.name.substr(1, 3).collect() + [Row(col=u'Ali'), Row(col=u'Bob')] + """ + if type(startPos) != type(length): + raise TypeError("Can not mix the type") + if isinstance(startPos, (int, long)): + jc = self._jc.substr(startPos, length) + elif isinstance(startPos, Column): + jc = self._jc.substr(startPos._jc, length._jc) + 
else: + raise TypeError("Unexpected type: %s" % type(startPos)) + return Column(jc, self.sql_ctx) + + __getslice__ = substr + + # order + asc = _unary_op("asc") + desc = _unary_op("desc") + + isNull = _unary_op("isNull", "True if the current expression is null.") + isNotNull = _unary_op("isNotNull", "True if the current expression is not null.") + + def alias(self, alias): + """Return a alias for this column + + >>> df.age.alias("age2").collect() + [Row(age2=2), Row(age2=5)] + """ + return Column(getattr(self._jc, "as")(alias), self.sql_ctx) + + def cast(self, dataType): + """ Convert the column into type `dataType` + + >>> df.select(df.age.cast("string").alias('ages')).collect() + [Row(ages=u'2'), Row(ages=u'5')] + >>> df.select(df.age.cast(StringType()).alias('ages')).collect() + [Row(ages=u'2'), Row(ages=u'5')] + """ + if self.sql_ctx is None: + sc = SparkContext._active_spark_context + ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc()) + else: + ssql_ctx = self.sql_ctx._ssql_ctx + if isinstance(dataType, basestring): + jc = self._jc.cast(dataType) + elif isinstance(dataType, DataType): + jdt = ssql_ctx.parseDataType(dataType.json()) + jc = self._jc.cast(jdt) + return Column(jc, self.sql_ctx) + + def to_pandas(self): + """ + Return a pandas.Series from the column + + >>> df.age.to_pandas() # doctest: +SKIP + 0 2 + 1 5 + dtype: int64 + """ + import pandas as pd + data = [c for c, in self.collect()] + return pd.Series(data) + + +def _aggregate_func(name, doc=""): + """ Create a function for aggregator by name""" + def _(col): + sc = SparkContext._active_spark_context + jc = getattr(sc._jvm.Dsl, name)(_to_java_column(col)) + return Column(jc) + _.__name__ = name + _.__doc__ = doc + return staticmethod(_) + + +class UserDefinedFunction(object): + def __init__(self, func, returnType): + self.func = func + self.returnType = returnType + self._broadcast = None + self._judf = self._create_judf() + + def _create_judf(self): + f = self.func # put it in closure `func` + func = lambda _, it: imap(lambda x: f(*x), it) + ser = AutoBatchedSerializer(PickleSerializer()) + command = (func, None, ser, ser) + sc = SparkContext._active_spark_context + pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self) + ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc()) + jdt = ssql_ctx.parseDataType(self.returnType.json()) + judf = sc._jvm.UserDefinedPythonFunction(f.__name__, bytearray(pickled_command), env, + includes, sc.pythonExec, broadcast_vars, + sc._javaAccumulator, jdt) + return judf + + def __del__(self): + if self._broadcast is not None: + self._broadcast.unpersist() + self._broadcast = None + + def __call__(self, *cols): + sc = SparkContext._active_spark_context + jcols = ListConverter().convert([_to_java_column(c) for c in cols], + sc._gateway._gateway_client) + jc = self._judf.apply(sc._jvm.PythonUtils.toSeq(jcols)) + return Column(jc) + + +class Dsl(object): + """ + A collections of builtin aggregators + """ + DSLS = { + 'lit': 'Creates a :class:`Column` of literal value.', + 'col': 'Returns a :class:`Column` based on the given column name.', + 'column': 'Returns a :class:`Column` based on the given column name.', + 'upper': 'Converts a string expression to upper case.', + 'lower': 'Converts a string expression to upper case.', + 'sqrt': 'Computes the square root of the specified float value.', + 'abs': 'Computes the absolutle value.', + + 'max': 'Aggregate function: returns the maximum value of the expression in a group.', + 'min': 'Aggregate function: returns the minimum value 
of the expression in a group.', + 'first': 'Aggregate function: returns the first value in a group.', + 'last': 'Aggregate function: returns the last value in a group.', + 'count': 'Aggregate function: returns the number of items in a group.', + 'sum': 'Aggregate function: returns the sum of all values in the expression.', + 'avg': 'Aggregate function: returns the average of the values in a group.', + 'mean': 'Aggregate function: returns the average of the values in a group.', + 'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.', + } + + for _name, _doc in DSLS.items(): + locals()[_name] = _aggregate_func(_name, _doc) + del _name, _doc + + @staticmethod + def countDistinct(col, *cols): + """ Return a new Column for distinct count of (col, *cols) + + >>> from pyspark.sql import Dsl + >>> df.agg(Dsl.countDistinct(df.age, df.name).alias('c')).collect() + [Row(c=2)] + + >>> df.agg(Dsl.countDistinct("age", "name").alias('c')).collect() + [Row(c=2)] + """ + sc = SparkContext._active_spark_context + jcols = ListConverter().convert([_to_java_column(c) for c in cols], + sc._gateway._gateway_client) + jc = sc._jvm.Dsl.countDistinct(_to_java_column(col), + sc._jvm.PythonUtils.toSeq(jcols)) + return Column(jc) + + @staticmethod + def approxCountDistinct(col, rsd=None): + """ Return a new Column for approxiate distinct count of (col, *cols) + + >>> from pyspark.sql import Dsl + >>> df.agg(Dsl.approxCountDistinct(df.age).alias('c')).collect() + [Row(c=2)] + """ + sc = SparkContext._active_spark_context + if rsd is None: + jc = sc._jvm.Dsl.approxCountDistinct(_to_java_column(col)) + else: + jc = sc._jvm.Dsl.approxCountDistinct(_to_java_column(col), rsd) + return Column(jc) + + @staticmethod + def udf(f, returnType=StringType()): + """Create a user defined function (UDF) + + >>> slen = Dsl.udf(lambda s: len(s), IntegerType()) + >>> df.select(slen(df.name).alias('slen')).collect() + [Row(slen=5), Row(slen=3)] + """ + return UserDefinedFunction(f, returnType) + + +def _test(): + import doctest + from pyspark.context import SparkContext + from pyspark.sql import Row, SQLContext + import pyspark.sql.dataframe + globs = pyspark.sql.dataframe.__dict__.copy() + sc = SparkContext('local[4]', 'PythonTest') + globs['sc'] = sc + globs['sqlCtx'] = sqlCtx = SQLContext(sc) + rdd2 = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]) + rdd3 = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]) + globs['df'] = sqlCtx.inferSchema(rdd2) + globs['df2'] = sqlCtx.inferSchema(rdd3) + (failure_count, test_count) = doctest.testmod( + pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS) + globs['sc'].stop() + if failure_count: + exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/sql_tests.py b/python/pyspark/sql/tests.py similarity index 96% rename from python/pyspark/sql_tests.py rename to python/pyspark/sql/tests.py index d314f46e8d2d5..d25c6365ed067 100644 --- a/python/pyspark/sql_tests.py +++ b/python/pyspark/sql/tests.py @@ -34,8 +34,10 @@ else: import unittest -from pyspark.sql import SQLContext, IntegerType, Row, ArrayType, StructType, StructField, \ - UserDefinedType, DoubleType + +from pyspark.sql import SQLContext, Column +from pyspark.sql.types import IntegerType, Row, ArrayType, StructType, StructField, \ + UserDefinedType, DoubleType, LongType from pyspark.tests import ReusedPySparkTestCase @@ -220,7 +222,7 @@ def test_convert_row_to_dict(self): self.assertEqual(1.0, row.asDict()['d']['key'].c) def 
test_infer_schema_with_udt(self): - from pyspark.sql_tests import ExamplePoint, ExamplePointUDT + from pyspark.sql.tests import ExamplePoint, ExamplePointUDT row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) rdd = self.sc.parallelize([row]) df = self.sqlCtx.inferSchema(rdd) @@ -232,7 +234,7 @@ def test_infer_schema_with_udt(self): self.assertEqual(point, ExamplePoint(1.0, 2.0)) def test_apply_schema_with_udt(self): - from pyspark.sql_tests import ExamplePoint, ExamplePointUDT + from pyspark.sql.tests import ExamplePoint, ExamplePointUDT row = (1.0, ExamplePoint(1.0, 2.0)) rdd = self.sc.parallelize([row]) schema = StructType([StructField("label", DoubleType(), False), @@ -242,7 +244,7 @@ def test_apply_schema_with_udt(self): self.assertEquals(point, ExamplePoint(1.0, 2.0)) def test_parquet_with_udt(self): - from pyspark.sql_tests import ExamplePoint + from pyspark.sql.tests import ExamplePoint row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) rdd = self.sc.parallelize([row]) df0 = self.sqlCtx.inferSchema(rdd) @@ -253,7 +255,6 @@ def test_parquet_with_udt(self): self.assertEquals(point, ExamplePoint(1.0, 2.0)) def test_column_operators(self): - from pyspark.sql import Column, LongType ci = self.df.key cs = self.df.value c = ci == cs diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py new file mode 100644 index 0000000000000..41afefe48ee5e --- /dev/null +++ b/python/pyspark/sql/types.py @@ -0,0 +1,1279 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import decimal +import datetime +import keyword +import warnings +import json +import re +from array import array +from operator import itemgetter + + +__all__ = [ + "DataType", "NullType", "StringType", "BinaryType", "BooleanType", "DateType", + "TimestampType", "DecimalType", "DoubleType", "FloatType", "ByteType", "IntegerType", + "LongType", "ShortType", "ArrayType", "MapType", "StructField", "StructType", ] + + +class DataType(object): + + """Spark SQL DataType""" + + def __repr__(self): + return self.__class__.__name__ + + def __hash__(self): + return hash(str(self)) + + def __eq__(self, other): + return (isinstance(other, self.__class__) and + self.__dict__ == other.__dict__) + + def __ne__(self, other): + return not self.__eq__(other) + + @classmethod + def typeName(cls): + return cls.__name__[:-4].lower() + + def jsonValue(self): + return self.typeName() + + def json(self): + return json.dumps(self.jsonValue(), + separators=(',', ':'), + sort_keys=True) + + +class PrimitiveTypeSingleton(type): + + """Metaclass for PrimitiveType""" + + _instances = {} + + def __call__(cls): + if cls not in cls._instances: + cls._instances[cls] = super(PrimitiveTypeSingleton, cls).__call__() + return cls._instances[cls] + + +class PrimitiveType(DataType): + + """Spark SQL PrimitiveType""" + + __metaclass__ = PrimitiveTypeSingleton + + def __eq__(self, other): + # because they should be the same object + return self is other + + +class NullType(PrimitiveType): + + """Spark SQL NullType + + The data type representing None, used for the types which has not + been inferred. + """ + + +class StringType(PrimitiveType): + + """Spark SQL StringType + + The data type representing string values. + """ + + +class BinaryType(PrimitiveType): + + """Spark SQL BinaryType + + The data type representing bytearray values. + """ + + +class BooleanType(PrimitiveType): + + """Spark SQL BooleanType + + The data type representing bool values. + """ + + +class DateType(PrimitiveType): + + """Spark SQL DateType + + The data type representing datetime.date values. + """ + + +class TimestampType(PrimitiveType): + + """Spark SQL TimestampType + + The data type representing datetime.datetime values. + """ + + +class DecimalType(DataType): + + """Spark SQL DecimalType + + The data type representing decimal.Decimal values. + """ + + def __init__(self, precision=None, scale=None): + self.precision = precision + self.scale = scale + self.hasPrecisionInfo = precision is not None + + def jsonValue(self): + if self.hasPrecisionInfo: + return "decimal(%d,%d)" % (self.precision, self.scale) + else: + return "decimal" + + def __repr__(self): + if self.hasPrecisionInfo: + return "DecimalType(%d,%d)" % (self.precision, self.scale) + else: + return "DecimalType()" + + +class DoubleType(PrimitiveType): + + """Spark SQL DoubleType + + The data type representing float values. + """ + + +class FloatType(PrimitiveType): + + """Spark SQL FloatType + + The data type representing single precision floating-point values. + """ + + +class ByteType(PrimitiveType): + + """Spark SQL ByteType + + The data type representing int values with 1 singed byte. + """ + + +class IntegerType(PrimitiveType): + + """Spark SQL IntegerType + + The data type representing int values. + """ + + +class LongType(PrimitiveType): + + """Spark SQL LongType + + The data type representing long values. If the any value is + beyond the range of [-9223372036854775808, 9223372036854775807], + please use DecimalType. 
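# Minimal sketch, assuming the types defined in this module: a field whose
# values may exceed the 64-bit long range is declared with DecimalType
# instead of LongType. The field names and the (38, 0) precision/scale
# below are illustrative only.
big_schema = StructType([
    StructField("id", LongType(), False),
    StructField("balance", DecimalType(38, 0), True),
])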
+ """ + + +class ShortType(PrimitiveType): + + """Spark SQL ShortType + + The data type representing int values with 2 signed bytes. + """ + + +class ArrayType(DataType): + + """Spark SQL ArrayType + + The data type representing list values. An ArrayType object + comprises two fields, elementType (a DataType) and containsNull (a bool). + The field of elementType is used to specify the type of array elements. + The field of containsNull is used to specify if the array has None values. + + """ + + def __init__(self, elementType, containsNull=True): + """Creates an ArrayType + + :param elementType: the data type of elements. + :param containsNull: indicates whether the list contains None values. + + >>> ArrayType(StringType) == ArrayType(StringType, True) + True + >>> ArrayType(StringType, False) == ArrayType(StringType) + False + """ + self.elementType = elementType + self.containsNull = containsNull + + def __repr__(self): + return "ArrayType(%s,%s)" % (self.elementType, + str(self.containsNull).lower()) + + def jsonValue(self): + return {"type": self.typeName(), + "elementType": self.elementType.jsonValue(), + "containsNull": self.containsNull} + + @classmethod + def fromJson(cls, json): + return ArrayType(_parse_datatype_json_value(json["elementType"]), + json["containsNull"]) + + +class MapType(DataType): + + """Spark SQL MapType + + The data type representing dict values. A MapType object comprises + three fields, keyType (a DataType), valueType (a DataType) and + valueContainsNull (a bool). + + The field of keyType is used to specify the type of keys in the map. + The field of valueType is used to specify the type of values in the map. + The field of valueContainsNull is used to specify if values of this + map has None values. + + For values of a MapType column, keys are not allowed to have None values. + + """ + + def __init__(self, keyType, valueType, valueContainsNull=True): + """Creates a MapType + :param keyType: the data type of keys. + :param valueType: the data type of values. + :param valueContainsNull: indicates whether values contains + null values. + + >>> (MapType(StringType, IntegerType) + ... == MapType(StringType, IntegerType, True)) + True + >>> (MapType(StringType, IntegerType, False) + ... == MapType(StringType, FloatType)) + False + """ + self.keyType = keyType + self.valueType = valueType + self.valueContainsNull = valueContainsNull + + def __repr__(self): + return "MapType(%s,%s,%s)" % (self.keyType, self.valueType, + str(self.valueContainsNull).lower()) + + def jsonValue(self): + return {"type": self.typeName(), + "keyType": self.keyType.jsonValue(), + "valueType": self.valueType.jsonValue(), + "valueContainsNull": self.valueContainsNull} + + @classmethod + def fromJson(cls, json): + return MapType(_parse_datatype_json_value(json["keyType"]), + _parse_datatype_json_value(json["valueType"]), + json["valueContainsNull"]) + + +class StructField(DataType): + + """Spark SQL StructField + + Represents a field in a StructType. + A StructField object comprises three fields, name (a string), + dataType (a DataType) and nullable (a bool). The field of name + is the name of a StructField. The field of dataType specifies + the data type of a StructField. + + The field of nullable specifies if values of a StructField can + contain None values. + + """ + + def __init__(self, name, dataType, nullable=True, metadata=None): + """Creates a StructField + :param name: the name of this field. + :param dataType: the data type of this field. 
+ :param nullable: indicates whether values of this field + can be null. + :param metadata: metadata of this field, which is a map from string + to simple type that can be serialized to JSON + automatically + + >>> (StructField("f1", StringType, True) + ... == StructField("f1", StringType, True)) + True + >>> (StructField("f1", StringType, True) + ... == StructField("f2", StringType, True)) + False + """ + self.name = name + self.dataType = dataType + self.nullable = nullable + self.metadata = metadata or {} + + def __repr__(self): + return "StructField(%s,%s,%s)" % (self.name, self.dataType, + str(self.nullable).lower()) + + def jsonValue(self): + return {"name": self.name, + "type": self.dataType.jsonValue(), + "nullable": self.nullable, + "metadata": self.metadata} + + @classmethod + def fromJson(cls, json): + return StructField(json["name"], + _parse_datatype_json_value(json["type"]), + json["nullable"], + json["metadata"]) + + +class StructType(DataType): + + """Spark SQL StructType + + The data type representing rows. + A StructType object comprises a list of L{StructField}. + + """ + + def __init__(self, fields): + """Creates a StructType + + >>> struct1 = StructType([StructField("f1", StringType, True)]) + >>> struct2 = StructType([StructField("f1", StringType, True)]) + >>> struct1 == struct2 + True + >>> struct1 = StructType([StructField("f1", StringType, True)]) + >>> struct2 = StructType([StructField("f1", StringType, True), + ... [StructField("f2", IntegerType, False)]]) + >>> struct1 == struct2 + False + """ + self.fields = fields + + def __repr__(self): + return ("StructType(List(%s))" % + ",".join(str(field) for field in self.fields)) + + def jsonValue(self): + return {"type": self.typeName(), + "fields": [f.jsonValue() for f in self.fields]} + + @classmethod + def fromJson(cls, json): + return StructType([StructField.fromJson(f) for f in json["fields"]]) + + +class UserDefinedType(DataType): + """ + .. note:: WARN: Spark Internal Use Only + SQL User-Defined Type (UDT). + """ + + @classmethod + def typeName(cls): + return cls.__name__.lower() + + @classmethod + def sqlType(cls): + """ + Underlying SQL storage type for this UDT. + """ + raise NotImplementedError("UDT must implement sqlType().") + + @classmethod + def module(cls): + """ + The Python module of the UDT. + """ + raise NotImplementedError("UDT must implement module().") + + @classmethod + def scalaUDT(cls): + """ + The class name of the paired Scala UDT. + """ + raise NotImplementedError("UDT must have a paired Scala UDT.") + + def serialize(self, obj): + """ + Converts the a user-type object into a SQL datum. + """ + raise NotImplementedError("UDT must implement serialize().") + + def deserialize(self, datum): + """ + Converts a SQL datum into a user-type object. 
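# A hedged sketch of the UDT contract described by these methods, loosely
# modeled on the ExamplePoint/ExamplePointUDT pair used in pyspark.sql.tests.
# The Point/PointUDT names, the module path and the paired Scala class name
# are assumptions for illustration; a working UDT needs a matching Scala UDT.
class Point(object):
    def __init__(self, x, y):
        self.x, self.y = x, y

class PointUDT(UserDefinedType):
    @classmethod
    def sqlType(cls):
        # underlying SQL storage: an array of two doubles
        return ArrayType(DoubleType(), containsNull=False)

    @classmethod
    def module(cls):
        return "mypackage.point"            # assumed Python module of the UDT

    @classmethod
    def scalaUDT(cls):
        return "org.example.PointUDT"       # assumed paired Scala UDT class

    def serialize(self, obj):
        return [obj.x, obj.y]               # user object -> SQL datum

    def deserialize(self, datum):
        return Point(datum[0], datum[1])    # SQL datum -> user object

Point.__UDT__ = PointUDT()                  # lets _infer_type pick up the UDT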
+ """ + raise NotImplementedError("UDT must implement deserialize().") + + def json(self): + return json.dumps(self.jsonValue(), separators=(',', ':'), sort_keys=True) + + def jsonValue(self): + schema = { + "type": "udt", + "class": self.scalaUDT(), + "pyClass": "%s.%s" % (self.module(), type(self).__name__), + "sqlType": self.sqlType().jsonValue() + } + return schema + + @classmethod + def fromJson(cls, json): + pyUDT = json["pyClass"] + split = pyUDT.rfind(".") + pyModule = pyUDT[:split] + pyClass = pyUDT[split+1:] + m = __import__(pyModule, globals(), locals(), [pyClass], -1) + UDT = getattr(m, pyClass) + return UDT() + + def __eq__(self, other): + return type(self) == type(other) + + +_all_primitive_types = dict((v.typeName(), v) + for v in globals().itervalues() + if type(v) is PrimitiveTypeSingleton and + v.__base__ == PrimitiveType) + + +_all_complex_types = dict((v.typeName(), v) + for v in [ArrayType, MapType, StructType]) + + +def _parse_datatype_json_string(json_string): + """Parses the given data type JSON string. + >>> def check_datatype(datatype): + ... scala_datatype = sqlCtx._ssql_ctx.parseDataType(datatype.json()) + ... python_datatype = _parse_datatype_json_string(scala_datatype.json()) + ... return datatype == python_datatype + >>> all(check_datatype(cls()) for cls in _all_primitive_types.values()) + True + >>> # Simple ArrayType. + >>> simple_arraytype = ArrayType(StringType(), True) + >>> check_datatype(simple_arraytype) + True + >>> # Simple MapType. + >>> simple_maptype = MapType(StringType(), LongType()) + >>> check_datatype(simple_maptype) + True + >>> # Simple StructType. + >>> simple_structtype = StructType([ + ... StructField("a", DecimalType(), False), + ... StructField("b", BooleanType(), True), + ... StructField("c", LongType(), True), + ... StructField("d", BinaryType(), False)]) + >>> check_datatype(simple_structtype) + True + >>> # Complex StructType. + >>> complex_structtype = StructType([ + ... StructField("simpleArray", simple_arraytype, True), + ... StructField("simpleMap", simple_maptype, True), + ... StructField("simpleStruct", simple_structtype, True), + ... StructField("boolean", BooleanType(), False), + ... StructField("withMeta", DoubleType(), False, {"name": "age"})]) + >>> check_datatype(complex_structtype) + True + >>> # Complex ArrayType. + >>> complex_arraytype = ArrayType(complex_structtype, True) + >>> check_datatype(complex_arraytype) + True + >>> # Complex MapType. + >>> complex_maptype = MapType(complex_structtype, + ... complex_arraytype, False) + >>> check_datatype(complex_maptype) + True + >>> check_datatype(ExamplePointUDT()) + True + >>> structtype_with_udt = StructType([StructField("label", DoubleType(), False), + ... 
StructField("point", ExamplePointUDT(), False)]) + >>> check_datatype(structtype_with_udt) + True + """ + return _parse_datatype_json_value(json.loads(json_string)) + + +_FIXED_DECIMAL = re.compile("decimal\\((\\d+),(\\d+)\\)") + + +def _parse_datatype_json_value(json_value): + if type(json_value) is unicode: + if json_value in _all_primitive_types.keys(): + return _all_primitive_types[json_value]() + elif json_value == u'decimal': + return DecimalType() + elif _FIXED_DECIMAL.match(json_value): + m = _FIXED_DECIMAL.match(json_value) + return DecimalType(int(m.group(1)), int(m.group(2))) + else: + raise ValueError("Could not parse datatype: %s" % json_value) + else: + tpe = json_value["type"] + if tpe in _all_complex_types: + return _all_complex_types[tpe].fromJson(json_value) + elif tpe == 'udt': + return UserDefinedType.fromJson(json_value) + else: + raise ValueError("not supported type: %s" % tpe) + + +# Mapping Python types to Spark SQL DataType +_type_mappings = { + type(None): NullType, + bool: BooleanType, + int: IntegerType, + long: LongType, + float: DoubleType, + str: StringType, + unicode: StringType, + bytearray: BinaryType, + decimal.Decimal: DecimalType, + datetime.date: DateType, + datetime.datetime: TimestampType, + datetime.time: TimestampType, +} + + +def _infer_type(obj): + """Infer the DataType from obj + + >>> p = ExamplePoint(1.0, 2.0) + >>> _infer_type(p) + ExamplePointUDT + """ + if obj is None: + raise ValueError("Can not infer type for None") + + if hasattr(obj, '__UDT__'): + return obj.__UDT__ + + dataType = _type_mappings.get(type(obj)) + if dataType is not None: + return dataType() + + if isinstance(obj, dict): + for key, value in obj.iteritems(): + if key is not None and value is not None: + return MapType(_infer_type(key), _infer_type(value), True) + else: + return MapType(NullType(), NullType(), True) + elif isinstance(obj, (list, array)): + for v in obj: + if v is not None: + return ArrayType(_infer_type(obj[0]), True) + else: + return ArrayType(NullType(), True) + else: + try: + return _infer_schema(obj) + except ValueError: + raise ValueError("not supported type: %s" % type(obj)) + + +def _infer_schema(row): + """Infer the schema from dict/namedtuple/object""" + if isinstance(row, dict): + items = sorted(row.items()) + + elif isinstance(row, tuple): + if hasattr(row, "_fields"): # namedtuple + items = zip(row._fields, tuple(row)) + elif hasattr(row, "__FIELDS__"): # Row + items = zip(row.__FIELDS__, tuple(row)) + elif all(isinstance(x, tuple) and len(x) == 2 for x in row): + items = row + else: + raise ValueError("Can't infer schema from tuple") + + elif hasattr(row, "__dict__"): # object + items = sorted(row.__dict__.items()) + + else: + raise ValueError("Can not infer schema for type: %s" % type(row)) + + fields = [StructField(k, _infer_type(v), True) for k, v in items] + return StructType(fields) + + +def _need_python_to_sql_conversion(dataType): + """ + Checks whether we need python to sql conversion for the given type. + For now, only UDTs need this conversion. + + >>> _need_python_to_sql_conversion(DoubleType()) + False + >>> schema0 = StructType([StructField("indices", ArrayType(IntegerType(), False), False), + ... 
StructField("values", ArrayType(DoubleType(), False), False)]) + >>> _need_python_to_sql_conversion(schema0) + False + >>> _need_python_to_sql_conversion(ExamplePointUDT()) + True + >>> schema1 = ArrayType(ExamplePointUDT(), False) + >>> _need_python_to_sql_conversion(schema1) + True + >>> schema2 = StructType([StructField("label", DoubleType(), False), + ... StructField("point", ExamplePointUDT(), False)]) + >>> _need_python_to_sql_conversion(schema2) + True + """ + if isinstance(dataType, StructType): + return any([_need_python_to_sql_conversion(f.dataType) for f in dataType.fields]) + elif isinstance(dataType, ArrayType): + return _need_python_to_sql_conversion(dataType.elementType) + elif isinstance(dataType, MapType): + return _need_python_to_sql_conversion(dataType.keyType) or \ + _need_python_to_sql_conversion(dataType.valueType) + elif isinstance(dataType, UserDefinedType): + return True + else: + return False + + +def _python_to_sql_converter(dataType): + """ + Returns a converter that converts a Python object into a SQL datum for the given type. + + >>> conv = _python_to_sql_converter(DoubleType()) + >>> conv(1.0) + 1.0 + >>> conv = _python_to_sql_converter(ArrayType(DoubleType(), False)) + >>> conv([1.0, 2.0]) + [1.0, 2.0] + >>> conv = _python_to_sql_converter(ExamplePointUDT()) + >>> conv(ExamplePoint(1.0, 2.0)) + [1.0, 2.0] + >>> schema = StructType([StructField("label", DoubleType(), False), + ... StructField("point", ExamplePointUDT(), False)]) + >>> conv = _python_to_sql_converter(schema) + >>> conv((1.0, ExamplePoint(1.0, 2.0))) + (1.0, [1.0, 2.0]) + """ + if not _need_python_to_sql_conversion(dataType): + return lambda x: x + + if isinstance(dataType, StructType): + names, types = zip(*[(f.name, f.dataType) for f in dataType.fields]) + converters = map(_python_to_sql_converter, types) + + def converter(obj): + if isinstance(obj, dict): + return tuple(c(obj.get(n)) for n, c in zip(names, converters)) + elif isinstance(obj, tuple): + if hasattr(obj, "_fields") or hasattr(obj, "__FIELDS__"): + return tuple(c(v) for c, v in zip(converters, obj)) + elif all(isinstance(x, tuple) and len(x) == 2 for x in obj): # k-v pairs + d = dict(obj) + return tuple(c(d.get(n)) for n, c in zip(names, converters)) + else: + return tuple(c(v) for c, v in zip(converters, obj)) + else: + raise ValueError("Unexpected tuple %r with type %r" % (obj, dataType)) + return converter + elif isinstance(dataType, ArrayType): + element_converter = _python_to_sql_converter(dataType.elementType) + return lambda a: [element_converter(v) for v in a] + elif isinstance(dataType, MapType): + key_converter = _python_to_sql_converter(dataType.keyType) + value_converter = _python_to_sql_converter(dataType.valueType) + return lambda m: dict([(key_converter(k), value_converter(v)) for k, v in m.items()]) + elif isinstance(dataType, UserDefinedType): + return lambda obj: dataType.serialize(obj) + else: + raise ValueError("Unexpected type %r" % dataType) + + +def _has_nulltype(dt): + """ Return whether there is NullType in `dt` or not """ + if isinstance(dt, StructType): + return any(_has_nulltype(f.dataType) for f in dt.fields) + elif isinstance(dt, ArrayType): + return _has_nulltype((dt.elementType)) + elif isinstance(dt, MapType): + return _has_nulltype(dt.keyType) or _has_nulltype(dt.valueType) + else: + return isinstance(dt, NullType) + + +def _merge_type(a, b): + if isinstance(a, NullType): + return b + elif isinstance(b, NullType): + return a + elif type(a) is not type(b): + # TODO: type cast (such as int -> long) 
+ raise TypeError("Can not merge type %s and %s" % (a, b)) + + # same type + if isinstance(a, StructType): + nfs = dict((f.name, f.dataType) for f in b.fields) + fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()))) + for f in a.fields] + names = set([f.name for f in fields]) + for n in nfs: + if n not in names: + fields.append(StructField(n, nfs[n])) + return StructType(fields) + + elif isinstance(a, ArrayType): + return ArrayType(_merge_type(a.elementType, b.elementType), True) + + elif isinstance(a, MapType): + return MapType(_merge_type(a.keyType, b.keyType), + _merge_type(a.valueType, b.valueType), + True) + else: + return a + + +def _create_converter(dataType): + """Create an converter to drop the names of fields in obj """ + if isinstance(dataType, ArrayType): + conv = _create_converter(dataType.elementType) + return lambda row: map(conv, row) + + elif isinstance(dataType, MapType): + kconv = _create_converter(dataType.keyType) + vconv = _create_converter(dataType.valueType) + return lambda row: dict((kconv(k), vconv(v)) for k, v in row.iteritems()) + + elif isinstance(dataType, NullType): + return lambda x: None + + elif not isinstance(dataType, StructType): + return lambda x: x + + # dataType must be StructType + names = [f.name for f in dataType.fields] + converters = [_create_converter(f.dataType) for f in dataType.fields] + + def convert_struct(obj): + if obj is None: + return + + if isinstance(obj, tuple): + if hasattr(obj, "_fields"): + d = dict(zip(obj._fields, obj)) + elif hasattr(obj, "__FIELDS__"): + d = dict(zip(obj.__FIELDS__, obj)) + elif all(isinstance(x, tuple) and len(x) == 2 for x in obj): + d = dict(obj) + else: + raise ValueError("unexpected tuple: %s" % str(obj)) + + elif isinstance(obj, dict): + d = obj + elif hasattr(obj, "__dict__"): # object + d = obj.__dict__ + else: + raise ValueError("Unexpected obj: %s" % obj) + + return tuple([conv(d.get(name)) for name, conv in zip(names, converters)]) + + return convert_struct + + +_BRACKETS = {'(': ')', '[': ']', '{': '}'} + + +def _split_schema_abstract(s): + """ + split the schema abstract into fields + + >>> _split_schema_abstract("a b c") + ['a', 'b', 'c'] + >>> _split_schema_abstract("a(a b)") + ['a(a b)'] + >>> _split_schema_abstract("a b[] c{a b}") + ['a', 'b[]', 'c{a b}'] + >>> _split_schema_abstract(" ") + [] + """ + + r = [] + w = '' + brackets = [] + for c in s: + if c == ' ' and not brackets: + if w: + r.append(w) + w = '' + else: + w += c + if c in _BRACKETS: + brackets.append(c) + elif c in _BRACKETS.values(): + if not brackets or c != _BRACKETS[brackets.pop()]: + raise ValueError("unexpected " + c) + + if brackets: + raise ValueError("brackets not closed: %s" % brackets) + if w: + r.append(w) + return r + + +def _parse_field_abstract(s): + """ + Parse a field in schema abstract + + >>> _parse_field_abstract("a") + StructField(a,None,true) + >>> _parse_field_abstract("b(c d)") + StructField(b,StructType(...c,None,true),StructField(d... + >>> _parse_field_abstract("a[]") + StructField(a,ArrayType(None,true),true) + >>> _parse_field_abstract("a{[]}") + StructField(a,MapType(None,ArrayType(None,true),true),true) + """ + if set(_BRACKETS.keys()) & set(s): + idx = min((s.index(c) for c in _BRACKETS if c in s)) + name = s[:idx] + return StructField(name, _parse_schema_abstract(s[idx:]), True) + else: + return StructField(s, None, True) + + +def _parse_schema_abstract(s): + """ + parse abstract into schema + + >>> _parse_schema_abstract("a b c") + StructType...a...b...c... 
+ >>> _parse_schema_abstract("a[b c] b{}") + StructType...a,ArrayType...b...c...b,MapType... + >>> _parse_schema_abstract("c{} d{a b}") + StructType...c,MapType...d,MapType...a...b... + >>> _parse_schema_abstract("a b(t)").fields[1] + StructField(b,StructType(List(StructField(t,None,true))),true) + """ + s = s.strip() + if not s: + return + + elif s.startswith('('): + return _parse_schema_abstract(s[1:-1]) + + elif s.startswith('['): + return ArrayType(_parse_schema_abstract(s[1:-1]), True) + + elif s.startswith('{'): + return MapType(None, _parse_schema_abstract(s[1:-1])) + + parts = _split_schema_abstract(s) + fields = [_parse_field_abstract(p) for p in parts] + return StructType(fields) + + +def _infer_schema_type(obj, dataType): + """ + Fill the dataType with types inferred from obj + + >>> schema = _parse_schema_abstract("a b c d") + >>> row = (1, 1.0, "str", datetime.date(2014, 10, 10)) + >>> _infer_schema_type(row, schema) + StructType...IntegerType...DoubleType...StringType...DateType... + >>> row = [[1], {"key": (1, 2.0)}] + >>> schema = _parse_schema_abstract("a[] b{c d}") + >>> _infer_schema_type(row, schema) + StructType...a,ArrayType...b,MapType(StringType,...c,IntegerType... + """ + if dataType is None: + return _infer_type(obj) + + if not obj: + return NullType() + + if isinstance(dataType, ArrayType): + eType = _infer_schema_type(obj[0], dataType.elementType) + return ArrayType(eType, True) + + elif isinstance(dataType, MapType): + k, v = obj.iteritems().next() + return MapType(_infer_schema_type(k, dataType.keyType), + _infer_schema_type(v, dataType.valueType)) + + elif isinstance(dataType, StructType): + fs = dataType.fields + assert len(fs) == len(obj), \ + "Obj(%s) have different length with fields(%s)" % (obj, fs) + fields = [StructField(f.name, _infer_schema_type(o, f.dataType), True) + for o, f in zip(obj, fs)] + return StructType(fields) + + else: + raise ValueError("Unexpected dataType: %s" % dataType) + + +_acceptable_types = { + BooleanType: (bool,), + ByteType: (int, long), + ShortType: (int, long), + IntegerType: (int, long), + LongType: (int, long), + FloatType: (float,), + DoubleType: (float,), + DecimalType: (decimal.Decimal,), + StringType: (str, unicode), + BinaryType: (bytearray,), + DateType: (datetime.date,), + TimestampType: (datetime.datetime,), + ArrayType: (list, tuple, array), + MapType: (dict,), + StructType: (tuple, list), +} + + +def _verify_type(obj, dataType): + """ + Verify the type of obj against dataType, raise an exception if + they do not match. + + >>> _verify_type(None, StructType([])) + >>> _verify_type("", StringType()) + >>> _verify_type(0, IntegerType()) + >>> _verify_type(range(3), ArrayType(ShortType())) + >>> _verify_type(set(), ArrayType(StringType())) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + TypeError:... + >>> _verify_type({}, MapType(StringType(), IntegerType())) + >>> _verify_type((), StructType([])) + >>> _verify_type([], StructType([])) + >>> _verify_type([1], StructType([])) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + ValueError:... + >>> _verify_type(ExamplePoint(1.0, 2.0), ExamplePointUDT()) + >>> _verify_type([1.0, 2.0], ExamplePointUDT()) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + ValueError:... 
+ """ + # all objects are nullable + if obj is None: + return + + if isinstance(dataType, UserDefinedType): + if not (hasattr(obj, '__UDT__') and obj.__UDT__ == dataType): + raise ValueError("%r is not an instance of type %r" % (obj, dataType)) + _verify_type(dataType.serialize(obj), dataType.sqlType()) + return + + _type = type(dataType) + assert _type in _acceptable_types, "unkown datatype: %s" % dataType + + # subclass of them can not be deserialized in JVM + if type(obj) not in _acceptable_types[_type]: + raise TypeError("%s can not accept object in type %s" + % (dataType, type(obj))) + + if isinstance(dataType, ArrayType): + for i in obj: + _verify_type(i, dataType.elementType) + + elif isinstance(dataType, MapType): + for k, v in obj.iteritems(): + _verify_type(k, dataType.keyType) + _verify_type(v, dataType.valueType) + + elif isinstance(dataType, StructType): + if len(obj) != len(dataType.fields): + raise ValueError("Length of object (%d) does not match with" + "length of fields (%d)" % (len(obj), len(dataType.fields))) + for v, f in zip(obj, dataType.fields): + _verify_type(v, f.dataType) + + +_cached_cls = {} + + +def _restore_object(dataType, obj): + """ Restore object during unpickling. """ + # use id(dataType) as key to speed up lookup in dict + # Because of batched pickling, dataType will be the + # same object in most cases. + k = id(dataType) + cls = _cached_cls.get(k) + if cls is None: + # use dataType as key to avoid create multiple class + cls = _cached_cls.get(dataType) + if cls is None: + cls = _create_cls(dataType) + _cached_cls[dataType] = cls + _cached_cls[k] = cls + return cls(obj) + + +def _create_object(cls, v): + """ Create an customized object with class `cls`. """ + # datetime.date would be deserialized as datetime.datetime + # from java type, so we need to set it back. + if cls is datetime.date and isinstance(v, datetime.datetime): + return v.date() + return cls(v) if v is not None else v + + +def _create_getter(dt, i): + """ Create a getter for item `i` with schema """ + cls = _create_cls(dt) + + def getter(self): + return _create_object(cls, self[i]) + + return getter + + +def _has_struct_or_date(dt): + """Return whether `dt` is or has StructType/DateType in it""" + if isinstance(dt, StructType): + return True + elif isinstance(dt, ArrayType): + return _has_struct_or_date(dt.elementType) + elif isinstance(dt, MapType): + return _has_struct_or_date(dt.keyType) or _has_struct_or_date(dt.valueType) + elif isinstance(dt, DateType): + return True + elif isinstance(dt, UserDefinedType): + return True + return False + + +def _create_properties(fields): + """Create properties according to fields""" + ps = {} + for i, f in enumerate(fields): + name = f.name + if (name.startswith("__") and name.endswith("__") + or keyword.iskeyword(name)): + warnings.warn("field name %s can not be accessed in Python," + "use position to access it instead" % name) + if _has_struct_or_date(f.dataType): + # delay creating object until accessing it + getter = _create_getter(f.dataType, i) + else: + getter = itemgetter(i) + ps[name] = property(getter) + return ps + + +def _create_cls(dataType): + """ + Create an class by dataType + + The created class is similar to namedtuple, but can have nested schema. 
+ + >>> schema = _parse_schema_abstract("a b c") + >>> row = (1, 1.0, "str") + >>> schema = _infer_schema_type(row, schema) + >>> obj = _create_cls(schema)(row) + >>> import pickle + >>> pickle.loads(pickle.dumps(obj)) + Row(a=1, b=1.0, c='str') + + >>> row = [[1], {"key": (1, 2.0)}] + >>> schema = _parse_schema_abstract("a[] b{c d}") + >>> schema = _infer_schema_type(row, schema) + >>> obj = _create_cls(schema)(row) + >>> pickle.loads(pickle.dumps(obj)) + Row(a=[1], b={'key': Row(c=1, d=2.0)}) + >>> pickle.loads(pickle.dumps(obj.a)) + [1] + >>> pickle.loads(pickle.dumps(obj.b)) + {'key': Row(c=1, d=2.0)} + """ + + if isinstance(dataType, ArrayType): + cls = _create_cls(dataType.elementType) + + def List(l): + if l is None: + return + return [_create_object(cls, v) for v in l] + + return List + + elif isinstance(dataType, MapType): + kcls = _create_cls(dataType.keyType) + vcls = _create_cls(dataType.valueType) + + def Dict(d): + if d is None: + return + return dict((_create_object(kcls, k), _create_object(vcls, v)) for k, v in d.items()) + + return Dict + + elif isinstance(dataType, DateType): + return datetime.date + + elif isinstance(dataType, UserDefinedType): + return lambda datum: dataType.deserialize(datum) + + elif not isinstance(dataType, StructType): + # no wrapper for primitive types + return lambda x: x + + class Row(tuple): + + """ Row in DataFrame """ + __DATATYPE__ = dataType + __FIELDS__ = tuple(f.name for f in dataType.fields) + __slots__ = () + + # create property for fast access + locals().update(_create_properties(dataType.fields)) + + def asDict(self): + """ Return as a dict """ + return dict((n, getattr(self, n)) for n in self.__FIELDS__) + + def __repr__(self): + # call collect __repr__ for nested objects + return ("Row(%s)" % ", ".join("%s=%r" % (n, getattr(self, n)) + for n in self.__FIELDS__)) + + def __reduce__(self): + return (_restore_object, (self.__DATATYPE__, tuple(self))) + + return Row + + +def _create_row(fields, values): + row = Row(*values) + row.__FIELDS__ = fields + return row + + +class Row(tuple): + + """ + A row in L{DataFrame}. The fields in it can be accessed like attributes. + + Row can be used to create a row object by using named arguments, + the fields will be sorted by names. 
+ + >>> row = Row(name="Alice", age=11) + >>> row + Row(age=11, name='Alice') + >>> row.name, row.age + ('Alice', 11) + + Row also can be used to create another Row like class, then it + could be used to create Row objects, such as + + >>> Person = Row("name", "age") + >>> Person + + >>> Person("Alice", 11) + Row(name='Alice', age=11) + """ + + def __new__(self, *args, **kwargs): + if args and kwargs: + raise ValueError("Can not use both args " + "and kwargs to create Row") + if args: + # create row class or objects + return tuple.__new__(self, args) + + elif kwargs: + # create row objects + names = sorted(kwargs.keys()) + values = tuple(kwargs[n] for n in names) + row = tuple.__new__(self, values) + row.__FIELDS__ = names + return row + + else: + raise ValueError("No args or kwargs") + + def asDict(self): + """ + Return as an dict + """ + if not hasattr(self, "__FIELDS__"): + raise TypeError("Cannot convert a Row class into dict") + return dict(zip(self.__FIELDS__, self)) + + # let obect acs like class + def __call__(self, *args): + """create new Row object""" + return _create_row(self, args) + + def __getattr__(self, item): + if item.startswith("__"): + raise AttributeError(item) + try: + # it will be slow when it has many fields, + # but this will not be used in normal cases + idx = self.__FIELDS__.index(item) + return self[idx] + except IndexError: + raise AttributeError(item) + + def __reduce__(self): + if hasattr(self, "__FIELDS__"): + return (_create_row, (self.__FIELDS__, tuple(self))) + else: + return tuple.__reduce__(self) + + def __repr__(self): + if hasattr(self, "__FIELDS__"): + return "Row(%s)" % ", ".join("%s=%r" % (k, v) + for k, v in zip(self.__FIELDS__, self)) + else: + return "" % ", ".join(self) + + +def _test(): + import doctest + from pyspark.context import SparkContext + # let doctest run in pyspark.sql.types, so DataTypes can be picklable + import pyspark.sql.types + from pyspark.sql import Row, SQLContext + from pyspark.sql.tests import ExamplePoint, ExamplePointUDT + globs = pyspark.sql.types.__dict__.copy() + sc = SparkContext('local[4]', 'PythonTest') + globs['sc'] = sc + globs['sqlCtx'] = sqlCtx = SQLContext(sc) + globs['ExamplePoint'] = ExamplePoint + globs['ExamplePointUDT'] = ExamplePointUDT + (failure_count, test_count) = doctest.testmod( + pyspark.sql.types, globs=globs, optionflags=doctest.ELLIPSIS) + globs['sc'].stop() + if failure_count: + exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/run-tests b/python/run-tests index 649a2c44d187b..58a26dd8ff088 100755 --- a/python/run-tests +++ b/python/run-tests @@ -64,8 +64,10 @@ function run_core_tests() { function run_sql_tests() { echo "Run sql tests ..." 
- run_test "pyspark/sql.py" - run_test "pyspark/sql_tests.py" + run_test "pyspark/sql/types.py" + run_test "pyspark/sql/context.py" + run_test "pyspark/sql/dataframe.py" + run_test "pyspark/sql/tests.py" } function run_mllib_tests() { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala index e6f622e87f7a4..eb045e37bf5a9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala @@ -37,7 +37,7 @@ private[sql] class ExamplePointUDT extends UserDefinedType[ExamplePoint] { override def sqlType: DataType = ArrayType(DoubleType, false) - override def pyUDT: String = "pyspark.sql_tests.ExamplePointUDT" + override def pyUDT: String = "pyspark.sql.tests.ExamplePointUDT" override def serialize(obj: Any): Seq[Double] = { obj match { From 31d435ecfdc24a788a6e38f4e82767bc275a3283 Mon Sep 17 00:00:00 2001 From: KaiXinXiaoLei Date: Mon, 9 Feb 2015 20:58:58 -0800 Subject: [PATCH 081/357] Add a config option to print DAG. Add a config option "spark.rddDebug.enable" to check whether to print DAG info. When "spark.rddDebug.enable" is true, it will print information about DAG in the log. Author: KaiXinXiaoLei Closes #4257 from KaiXinXiaoLei/DAGprint and squashes the following commits: d9fe42e [KaiXinXiaoLei] change log info c27ee76 [KaiXinXiaoLei] change log info 83c2b32 [KaiXinXiaoLei] change config option adcb14f [KaiXinXiaoLei] change the file. f4e7b9e [KaiXinXiaoLei] add a option to print DAG --- core/src/main/scala/org/apache/spark/SparkContext.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 71bdbc9b38ddb..8d3c3d000adf3 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1420,6 +1420,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli val callSite = getCallSite val cleanedFunc = clean(func) logInfo("Starting job: " + callSite.shortForm) + if (conf.getBoolean("spark.logLineage", false)) { + logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString) + } dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal, resultHandler, localProperties.get) progressBar.foreach(_.finishAll()) From 36c4e1d75933dc843acb747b91dc12e75ad1df42 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 9 Feb 2015 21:13:58 -0800 Subject: [PATCH 082/357] SPARK-4900 [MLLIB] MLlib SingularValueDecomposition ARPACK IllegalStateException Fix ARPACK error code mapping, at least. It's not yet clear whether the error is what we expect from ARPACK. If it isn't, not sure if that's to be treated as an MLlib or Breeze issue. 
Author: Sean Owen Closes #4485 from srowen/SPARK-4900 and squashes the following commits: 7355aa1 [Sean Owen] Fix ARPACK error code mapping --- .../org/apache/spark/mllib/linalg/EigenValueDecomposition.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala index 9d6f97528148e..866936aa4f118 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala @@ -117,7 +117,7 @@ private[mllib] object EigenValueDecomposition { info.`val` match { case 1 => throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val` + " Maximum number of iterations taken. (Refer ARPACK user guide for details)") - case 2 => throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val` + + case 3 => throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val` + " No shifts could be applied. Try to increase NCV. " + "(Refer ARPACK user guide for details)") case _ => throw new IllegalStateException("ARPACK returns non-zero info = " + info.`val` + From 20a6013106b56a1a1cc3e8cda092330ffbe77cc3 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 9 Feb 2015 21:17:06 -0800 Subject: [PATCH 083/357] [SPARK-2996] Implement userClassPathFirst for driver, yarn. Yarn's config option `spark.yarn.user.classpath.first` does not work the same way as `spark.files.userClassPathFirst`; Yarn's version is a lot more dangerous, in that it modifies the system classpath, instead of restricting the changes to the user's class loader. So this change implements the behavior of the latter for Yarn, and deprecates the more dangerous choice. To be able to achieve feature-parity, I also implemented the option for drivers (the existing option only applies to executors). So now there are two options, each controlling whether to apply userClassPathFirst to the driver or executors. The old option was deprecated, and aliased to the new one (`spark.executor.userClassPathFirst`). The existing "child-first" class loader also had to be fixed. It didn't handle resources, and it was also doing some things that ended up causing JVM errors depending on how things were being called. Author: Marcelo Vanzin Closes #3233 from vanzin/SPARK-2996 and squashes the following commits: 9cf9cf1 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 a1499e2 [Marcelo Vanzin] Remove SPARK_HOME propagation. fa7df88 [Marcelo Vanzin] Remove 'test.resource' file, create it dynamically. a8c69f1 [Marcelo Vanzin] Review feedback. cabf962 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 a1b8d7e [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 3f768e3 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 2ce3c7a [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 0e6d6be [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 70d4044 [Marcelo Vanzin] Fix pyspark/yarn-cluster test. 0fe7777 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 0e6ef19 [Marcelo Vanzin] Move class loaders around and make names more meaninful. fe970a7 [Marcelo Vanzin] Review feedback. 25d4fed [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 3cb6498 [Marcelo Vanzin] Call the right loadClass() method on the parent. fbb8ab5 [Marcelo Vanzin] Add locking in loadClass() to avoid deadlocks. 
2e6c4b7 [Marcelo Vanzin] Mention new setting in documentation. b6497f9 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 a10f379 [Marcelo Vanzin] Some feedback. 3730151 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 f513871 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 44010b6 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 7b57cba [Marcelo Vanzin] Remove now outdated message. 5304d64 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 35949c8 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 54e1a98 [Marcelo Vanzin] Merge branch 'master' into SPARK-2996 d1273b2 [Marcelo Vanzin] Add test file to rat exclude. fa1aafa [Marcelo Vanzin] Remove write check on user jars. 89d8072 [Marcelo Vanzin] Cleanups. a963ea3 [Marcelo Vanzin] Implement spark.driver.userClassPathFirst for standalone cluster mode. 50afa5f [Marcelo Vanzin] Fix Yarn executor command line. 7d14397 [Marcelo Vanzin] Register user jars in executor up front. 7f8603c [Marcelo Vanzin] Fix yarn-cluster mode without userClassPathFirst. 20373f5 [Marcelo Vanzin] Fix ClientBaseSuite. 55c88fa [Marcelo Vanzin] Run all Yarn integration tests via spark-submit. 0b64d92 [Marcelo Vanzin] Add deprecation warning to yarn option. 4a84d87 [Marcelo Vanzin] Fix the child-first class loader. d0394b8 [Marcelo Vanzin] Add "deprecated configs" to SparkConf. 46d8cf2 [Marcelo Vanzin] Update doc with new option, change name to "userClassPathFirst". a314f2d [Marcelo Vanzin] Enable driver class path isolation in SparkSubmit. 91f7e54 [Marcelo Vanzin] [yarn] Enable executor class path isolation. a853e74 [Marcelo Vanzin] Re-work CoarseGrainedExecutorBackend command line arguments. 89522ef [Marcelo Vanzin] Add class path isolation support for Yarn cluster mode. --- .../scala/org/apache/spark/SparkConf.scala | 83 +++++- .../scala/org/apache/spark/TestUtils.scala | 19 +- .../org/apache/spark/deploy/Client.scala | 5 +- .../org/apache/spark/deploy/SparkSubmit.scala | 8 +- .../spark/deploy/master/ui/MasterPage.scala | 2 +- .../deploy/rest/StandaloneRestServer.scala | 2 +- .../spark/deploy/worker/DriverRunner.scala | 15 +- .../spark/deploy/worker/DriverWrapper.scala | 20 +- .../CoarseGrainedExecutorBackend.scala | 83 +++++- .../org/apache/spark/executor/Executor.scala | 52 ++-- .../executor/ExecutorURLClassLoader.scala | 84 ------ .../cluster/SparkDeploySchedulerBackend.scala | 9 +- .../mesos/CoarseMesosSchedulerBackend.scala | 21 +- .../spark/util/MutableURLClassLoader.scala | 103 +++++++ .../apache/spark/util/ParentClassLoader.scala | 7 +- .../org/apache/spark/SparkConfSuite.scala | 12 + .../spark/deploy/SparkSubmitSuite.scala | 27 ++ .../MutableURLClassLoaderSuite.scala} | 12 +- docs/configuration.md | 31 +- pom.xml | 12 +- project/SparkBuild.scala | 8 +- .../spark/deploy/yarn/ApplicationMaster.scala | 25 +- .../org/apache/spark/deploy/yarn/Client.scala | 133 ++++----- .../spark/deploy/yarn/ExecutorRunnable.scala | 25 +- yarn/src/test/resources/log4j.properties | 4 +- .../spark/deploy/yarn/ClientSuite.scala | 6 +- .../spark/deploy/yarn/YarnClusterSuite.scala | 276 ++++++++++++------ 27 files changed, 736 insertions(+), 348 deletions(-) delete mode 100644 core/src/main/scala/org/apache/spark/executor/ExecutorURLClassLoader.scala create mode 100644 core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala rename core/src/test/scala/org/apache/spark/{executor/ExecutorURLClassLoaderSuite.scala => util/MutableURLClassLoaderSuite.scala} (90%) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala 
b/core/src/main/scala/org/apache/spark/SparkConf.scala index 13aa9960ac33a..0dbd26146cb13 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -18,6 +18,7 @@ package org.apache.spark import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConverters._ import scala.collection.mutable.LinkedHashSet @@ -67,7 +68,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { if (value == null) { throw new NullPointerException("null value for " + key) } - settings.put(key, value) + settings.put(translateConfKey(key, warn = true), value) this } @@ -139,7 +140,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { /** Set a parameter if it isn't already configured */ def setIfMissing(key: String, value: String): SparkConf = { - settings.putIfAbsent(key, value) + settings.putIfAbsent(translateConfKey(key, warn = true), value) this } @@ -175,7 +176,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { /** Get a parameter as an Option */ def getOption(key: String): Option[String] = { - Option(settings.get(key)) + Option(settings.get(translateConfKey(key))) } /** Get all parameters as a list of pairs */ @@ -228,7 +229,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { def getAppId: String = get("spark.app.id") /** Does the configuration contain a given parameter? */ - def contains(key: String): Boolean = settings.containsKey(key) + def contains(key: String): Boolean = settings.containsKey(translateConfKey(key)) /** Copy this object */ override def clone: SparkConf = { @@ -285,7 +286,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { // Validate memory fractions val memoryKeys = Seq( "spark.storage.memoryFraction", - "spark.shuffle.memoryFraction", + "spark.shuffle.memoryFraction", "spark.shuffle.safetyFraction", "spark.storage.unrollFraction", "spark.storage.safetyFraction") @@ -351,9 +352,20 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { def toDebugString: String = { getAll.sorted.map{case (k, v) => k + "=" + v}.mkString("\n") } + } -private[spark] object SparkConf { +private[spark] object SparkConf extends Logging { + + private val deprecatedConfigs: Map[String, DeprecatedConfig] = { + val configs = Seq( + DeprecatedConfig("spark.files.userClassPathFirst", "spark.executor.userClassPathFirst", + "1.3"), + DeprecatedConfig("spark.yarn.user.classpath.first", null, "1.3", + "Use spark.{driver,executor}.userClassPathFirst instead.")) + configs.map { x => (x.oldName, x) }.toMap + } + /** * Return whether the given config is an akka config (e.g. akka.actor.provider). * Note that this does not include spark-specific akka configs (e.g. spark.akka.timeout). @@ -380,4 +392,63 @@ private[spark] object SparkConf { def isSparkPortConf(name: String): Boolean = { (name.startsWith("spark.") && name.endsWith(".port")) || name.startsWith("spark.port.") } + + /** + * Translate the configuration key if it is deprecated and has a replacement, otherwise just + * returns the provided key. + * + * @param userKey Configuration key from the user / caller. + * @param warn Whether to print a warning if the key is deprecated. Warnings will be printed + * only once for each key. 
+ */ + def translateConfKey(userKey: String, warn: Boolean = false): String = { + deprecatedConfigs.get(userKey) + .map { deprecatedKey => + if (warn) { + deprecatedKey.warn() + } + deprecatedKey.newName.getOrElse(userKey) + }.getOrElse(userKey) + } + + /** + * Holds information about keys that have been deprecated or renamed. + * + * @param oldName Old configuration key. + * @param newName New configuration key, or `null` if key has no replacement, in which case the + * deprecated key will be used (but the warning message will still be printed). + * @param version Version of Spark where key was deprecated. + * @param deprecationMessage Message to include in the deprecation warning; mandatory when + * `newName` is not provided. + */ + private case class DeprecatedConfig( + oldName: String, + _newName: String, + version: String, + deprecationMessage: String = null) { + + private val warned = new AtomicBoolean(false) + val newName = Option(_newName) + + if (newName == null && (deprecationMessage == null || deprecationMessage.isEmpty())) { + throw new IllegalArgumentException("Need new config name or deprecation message.") + } + + def warn(): Unit = { + if (warned.compareAndSet(false, true)) { + if (newName != null) { + val message = Option(deprecationMessage).getOrElse( + s"Please use the alternative '$newName' instead.") + logWarning( + s"The configuration option '$oldName' has been replaced as of Spark $version and " + + s"may be removed in the future. $message") + } else { + logWarning( + s"The configuration option '$oldName' has been deprecated as of Spark $version and " + + s"may be removed in the future. $deprecationMessage") + } + } + } + + } } diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index be081c3825566..35b324ba6f573 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -17,12 +17,13 @@ package org.apache.spark -import java.io.{File, FileInputStream, FileOutputStream} +import java.io.{ByteArrayInputStream, File, FileInputStream, FileOutputStream} import java.net.{URI, URL} import java.util.jar.{JarEntry, JarOutputStream} import scala.collection.JavaConversions._ +import com.google.common.base.Charsets.UTF_8 import com.google.common.io.{ByteStreams, Files} import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} @@ -59,6 +60,22 @@ private[spark] object TestUtils { createJar(files1 ++ files2, jarFile) } + /** + * Create a jar file containing multiple files. The `files` map contains a mapping of + * file names in the jar file to their contents. + */ + def createJarWithFiles(files: Map[String, String], dir: File = null): URL = { + val tempDir = Option(dir).getOrElse(Utils.createTempDir()) + val jarFile = File.createTempFile("testJar", ".jar", tempDir) + val jarStream = new JarOutputStream(new FileOutputStream(jarFile)) + files.foreach { case (k, v) => + val entry = new JarEntry(k) + jarStream.putNextEntry(entry) + ByteStreams.copy(new ByteArrayInputStream(v.getBytes(UTF_8)), jarStream) + } + jarStream.close() + jarFile.toURI.toURL + } /** * Create a jar file that contains this set of files. 
All files will be located at the root diff --git a/core/src/main/scala/org/apache/spark/deploy/Client.scala b/core/src/main/scala/org/apache/spark/deploy/Client.scala index 38b3da0b13756..237d26fc6bd0e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/Client.scala +++ b/core/src/main/scala/org/apache/spark/deploy/Client.scala @@ -68,8 +68,9 @@ private class ClientActor(driverArgs: ClientArguments, conf: SparkConf) .map(Utils.splitCommandString).getOrElse(Seq.empty) val sparkJavaOpts = Utils.sparkJavaOpts(conf) val javaOpts = sparkJavaOpts ++ extraJavaOpts - val command = new Command(mainClass, Seq("{{WORKER_URL}}", driverArgs.mainClass) ++ - driverArgs.driverOptions, sys.env, classPathEntries, libraryPathEntries, javaOpts) + val command = new Command(mainClass, + Seq("{{WORKER_URL}}", "{{USER_JAR}}", driverArgs.mainClass) ++ driverArgs.driverOptions, + sys.env, classPathEntries, libraryPathEntries, javaOpts) val driverDescription = new DriverDescription( driverArgs.jarUrl, diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 6d213926f3d7b..c4bc5054d61a1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -37,7 +37,7 @@ import org.apache.ivy.plugins.resolver.{ChainResolver, IBiblioResolver} import org.apache.spark.deploy.rest._ import org.apache.spark.executor._ -import org.apache.spark.util.Utils +import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils} /** * Whether to submit, kill, or request the status of an application. @@ -467,11 +467,11 @@ object SparkSubmit { } val loader = - if (sysProps.getOrElse("spark.files.userClassPathFirst", "false").toBoolean) { - new ChildExecutorURLClassLoader(new Array[URL](0), + if (sysProps.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) { + new ChildFirstURLClassLoader(new Array[URL](0), Thread.currentThread.getContextClassLoader) } else { - new ExecutorURLClassLoader(new Array[URL](0), + new MutableURLClassLoader(new Array[URL](0), Thread.currentThread.getContextClassLoader) } Thread.currentThread.setContextClassLoader(loader) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index b47a081053e77..fd514f07664a9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -196,7 +196,7 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") {
    - + } } diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala index 2033d67e1f394..6e4486e20fcba 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala @@ -392,7 +392,7 @@ private class SubmitRequestServlet( val javaOpts = sparkJavaOpts ++ extraJavaOpts val command = new Command( "org.apache.spark.deploy.worker.DriverWrapper", - Seq("{{WORKER_URL}}", mainClass) ++ appArgs, // args to the DriverWrapper + Seq("{{WORKER_URL}}", "{{USER_JAR}}", mainClass) ++ appArgs, // args to the DriverWrapper environmentVariables, extraClassPath, extraLibraryPath, javaOpts) val actualDriverMemory = driverMemory.map(Utils.memoryStringToMb).getOrElse(DEFAULT_MEMORY) val actualDriverCores = driverCores.map(_.toInt).getOrElse(DEFAULT_CORES) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala index 28cab36c7b9e2..b964a09bdb218 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala @@ -74,10 +74,15 @@ private[spark] class DriverRunner( val driverDir = createWorkingDirectory() val localJarFilename = downloadUserJar(driverDir) - // Make sure user application jar is on the classpath + def substituteVariables(argument: String): String = argument match { + case "{{WORKER_URL}}" => workerUrl + case "{{USER_JAR}}" => localJarFilename + case other => other + } + // TODO: If we add ability to submit multiple jars they should also be added here val builder = CommandUtils.buildProcessBuilder(driverDesc.command, driverDesc.mem, - sparkHome.getAbsolutePath, substituteVariables, Seq(localJarFilename)) + sparkHome.getAbsolutePath, substituteVariables) launchDriver(builder, driverDir, driverDesc.supervise) } catch { @@ -111,12 +116,6 @@ private[spark] class DriverRunner( } } - /** Replace variables in a command argument passed to us */ - private def substituteVariables(argument: String): String = argument match { - case "{{WORKER_URL}}" => workerUrl - case other => other - } - /** * Creates the working directory for this driver. * Will throw an exception if there are errors preparing the directory. diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala index 05e242e6df702..ab467a5ee8c6c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala @@ -17,10 +17,12 @@ package org.apache.spark.deploy.worker +import java.io.File + import akka.actor._ import org.apache.spark.{SecurityManager, SparkConf} -import org.apache.spark.util.{AkkaUtils, Utils} +import org.apache.spark.util.{AkkaUtils, ChildFirstURLClassLoader, MutableURLClassLoader, Utils} /** * Utility object for launching driver programs such that they share fate with the Worker process. 
@@ -28,21 +30,31 @@ import org.apache.spark.util.{AkkaUtils, Utils} object DriverWrapper { def main(args: Array[String]) { args.toList match { - case workerUrl :: mainClass :: extraArgs => + case workerUrl :: userJar :: mainClass :: extraArgs => val conf = new SparkConf() val (actorSystem, _) = AkkaUtils.createActorSystem("Driver", Utils.localHostName(), 0, conf, new SecurityManager(conf)) actorSystem.actorOf(Props(classOf[WorkerWatcher], workerUrl), name = "workerWatcher") + val currentLoader = Thread.currentThread.getContextClassLoader + val userJarUrl = new File(userJar).toURI().toURL() + val loader = + if (sys.props.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) { + new ChildFirstURLClassLoader(Array(userJarUrl), currentLoader) + } else { + new MutableURLClassLoader(Array(userJarUrl), currentLoader) + } + Thread.currentThread.setContextClassLoader(loader) + // Delegate to supplied main class - val clazz = Class.forName(args(1)) + val clazz = Class.forName(mainClass, true, loader) val mainMethod = clazz.getMethod("main", classOf[Array[String]]) mainMethod.invoke(null, extraArgs.toArray[String]) actorSystem.shutdown() case _ => - System.err.println("Usage: DriverWrapper [options]") + System.err.println("Usage: DriverWrapper [options]") System.exit(-1) } } diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index 3a42f8b157977..dd19e4947db1e 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -17,8 +17,10 @@ package org.apache.spark.executor +import java.net.URL import java.nio.ByteBuffer +import scala.collection.mutable import scala.concurrent.Await import akka.actor.{Actor, ActorSelection, Props} @@ -38,6 +40,7 @@ private[spark] class CoarseGrainedExecutorBackend( executorId: String, hostPort: String, cores: Int, + userClassPath: Seq[URL], env: SparkEnv) extends Actor with ActorLogReceive with ExecutorBackend with Logging { @@ -63,7 +66,7 @@ private[spark] class CoarseGrainedExecutorBackend( case RegisteredExecutor => logInfo("Successfully registered with driver") val (hostname, _) = Utils.parseHostPort(hostPort) - executor = new Executor(executorId, hostname, env, isLocal = false) + executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false) case RegisterExecutorFailed(message) => logError("Slave registration failed: " + message) @@ -117,7 +120,8 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { hostname: String, cores: Int, appId: String, - workerUrl: Option[String]) { + workerUrl: Option[String], + userClassPath: Seq[URL]) { SignalLogger.register(log) @@ -162,7 +166,7 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { val sparkHostPort = hostname + ":" + boundPort env.actorSystem.actorOf( Props(classOf[CoarseGrainedExecutorBackend], - driverUrl, executorId, sparkHostPort, cores, env), + driverUrl, executorId, sparkHostPort, cores, userClassPath, env), name = "Executor") workerUrl.foreach { url => env.actorSystem.actorOf(Props(classOf[WorkerWatcher], url), name = "WorkerWatcher") @@ -172,20 +176,69 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { } def main(args: Array[String]) { - args.length match { - case x if x < 5 => - System.err.println( + var driverUrl: String = null + var executorId: String = null + var hostname: String 
= null + var cores: Int = 0 + var appId: String = null + var workerUrl: Option[String] = None + val userClassPath = new mutable.ListBuffer[URL]() + + var argv = args.toList + while (!argv.isEmpty) { + argv match { + case ("--driver-url") :: value :: tail => + driverUrl = value + argv = tail + case ("--executor-id") :: value :: tail => + executorId = value + argv = tail + case ("--hostname") :: value :: tail => + hostname = value + argv = tail + case ("--cores") :: value :: tail => + cores = value.toInt + argv = tail + case ("--app-id") :: value :: tail => + appId = value + argv = tail + case ("--worker-url") :: value :: tail => // Worker url is used in spark standalone mode to enforce fate-sharing with worker - "Usage: CoarseGrainedExecutorBackend " + - " [] ") - System.exit(1) + workerUrl = Some(value) + argv = tail + case ("--user-class-path") :: value :: tail => + userClassPath += new URL(value) + argv = tail + case Nil => + case tail => + System.err.println(s"Unrecognized options: ${tail.mkString(" ")}") + printUsageAndExit() + } + } - // NB: These arguments are provided by SparkDeploySchedulerBackend (for standalone mode) - // and CoarseMesosSchedulerBackend (for mesos mode). - case 5 => - run(args(0), args(1), args(2), args(3).toInt, args(4), None) - case x if x > 5 => - run(args(0), args(1), args(2), args(3).toInt, args(4), Some(args(5))) + if (driverUrl == null || executorId == null || hostname == null || cores <= 0 || + appId == null) { + printUsageAndExit() } + + run(driverUrl, executorId, hostname, cores, appId, workerUrl, userClassPath) } + + private def printUsageAndExit() = { + System.err.println( + """ + |"Usage: CoarseGrainedExecutorBackend [options] + | + | Options are: + | --driver-url + | --executor-id + | --hostname + | --cores + | --app-id + | --worker-url + | --user-class-path + |""".stripMargin) + System.exit(1) + } + } diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 5141483d1e745..6b22dcd6f5cbf 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -19,6 +19,7 @@ package org.apache.spark.executor import java.io.File import java.lang.management.ManagementFactory +import java.net.URL import java.nio.ByteBuffer import java.util.concurrent._ @@ -33,7 +34,8 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler._ import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{StorageLevel, TaskResultBlockId} -import org.apache.spark.util.{SparkUncaughtExceptionHandler, AkkaUtils, Utils} +import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, + SparkUncaughtExceptionHandler, AkkaUtils, Utils} /** * Spark executor used with Mesos, YARN, and the standalone scheduler. @@ -43,6 +45,7 @@ private[spark] class Executor( executorId: String, executorHostname: String, env: SparkEnv, + userClassPath: Seq[URL] = Nil, isLocal: Boolean = false) extends Logging { @@ -288,17 +291,23 @@ private[spark] class Executor( * created by the interpreter to the search path */ private def createClassLoader(): MutableURLClassLoader = { + // Bootstrap the list of jars with the user class path. + val now = System.currentTimeMillis() + userClassPath.foreach { url => + currentJars(url.getPath().split("/").last) = now + } + val currentLoader = Utils.getContextOrSparkClassLoader // For each of the jars in the jarSet, add them to the class loader. 
// We assume each of the files has already been fetched. - val urls = currentJars.keySet.map { uri => + val urls = userClassPath.toArray ++ currentJars.keySet.map { uri => new File(uri.split("/").last).toURI.toURL - }.toArray - val userClassPathFirst = conf.getBoolean("spark.files.userClassPathFirst", false) - userClassPathFirst match { - case true => new ChildExecutorURLClassLoader(urls, currentLoader) - case false => new ExecutorURLClassLoader(urls, currentLoader) + } + if (conf.getBoolean("spark.executor.userClassPathFirst", false)) { + new ChildFirstURLClassLoader(urls, currentLoader) + } else { + new MutableURLClassLoader(urls, currentLoader) } } @@ -311,7 +320,7 @@ private[spark] class Executor( if (classUri != null) { logInfo("Using REPL class URI: " + classUri) val userClassPathFirst: java.lang.Boolean = - conf.getBoolean("spark.files.userClassPathFirst", false) + conf.getBoolean("spark.executor.userClassPathFirst", false) try { val klass = Class.forName("org.apache.spark.repl.ExecutorClassLoader") .asInstanceOf[Class[_ <: ClassLoader]] @@ -344,18 +353,23 @@ private[spark] class Executor( env.securityManager, hadoopConf, timestamp, useCache = !isLocal) currentFiles(name) = timestamp } - for ((name, timestamp) <- newJars if currentJars.getOrElse(name, -1L) < timestamp) { - logInfo("Fetching " + name + " with timestamp " + timestamp) - // Fetch file with useCache mode, close cache for local mode. - Utils.fetchFile(name, new File(SparkFiles.getRootDirectory), conf, - env.securityManager, hadoopConf, timestamp, useCache = !isLocal) - currentJars(name) = timestamp - // Add it to our class loader + for ((name, timestamp) <- newJars) { val localName = name.split("/").last - val url = new File(SparkFiles.getRootDirectory, localName).toURI.toURL - if (!urlClassLoader.getURLs.contains(url)) { - logInfo("Adding " + url + " to class loader") - urlClassLoader.addURL(url) + val currentTimeStamp = currentJars.get(name) + .orElse(currentJars.get(localName)) + .getOrElse(-1L) + if (currentTimeStamp < timestamp) { + logInfo("Fetching " + name + " with timestamp " + timestamp) + // Fetch file with useCache mode, close cache for local mode. + Utils.fetchFile(name, new File(SparkFiles.getRootDirectory), conf, + env.securityManager, hadoopConf, timestamp, useCache = !isLocal) + currentJars(name) = timestamp + // Add it to our class loader + val url = new File(SparkFiles.getRootDirectory, localName).toURI.toURL + if (!urlClassLoader.getURLs.contains(url)) { + logInfo("Adding " + url + " to class loader") + urlClassLoader.addURL(url) + } } } } diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorURLClassLoader.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorURLClassLoader.scala deleted file mode 100644 index 8011e75944aac..0000000000000 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorURLClassLoader.scala +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.executor - -import java.net.{URLClassLoader, URL} - -import org.apache.spark.util.ParentClassLoader - -/** - * The addURL method in URLClassLoader is protected. We subclass it to make this accessible. - * We also make changes so user classes can come before the default classes. - */ - -private[spark] trait MutableURLClassLoader extends ClassLoader { - def addURL(url: URL) - def getURLs: Array[URL] -} - -private[spark] class ChildExecutorURLClassLoader(urls: Array[URL], parent: ClassLoader) - extends MutableURLClassLoader { - - private object userClassLoader extends URLClassLoader(urls, null){ - override def addURL(url: URL) { - super.addURL(url) - } - override def findClass(name: String): Class[_] = { - val loaded = super.findLoadedClass(name) - if (loaded != null) { - return loaded - } - try { - super.findClass(name) - } catch { - case e: ClassNotFoundException => { - parentClassLoader.loadClass(name) - } - } - } - } - - private val parentClassLoader = new ParentClassLoader(parent) - - override def findClass(name: String): Class[_] = { - try { - userClassLoader.findClass(name) - } catch { - case e: ClassNotFoundException => { - parentClassLoader.loadClass(name) - } - } - } - - def addURL(url: URL) { - userClassLoader.addURL(url) - } - - def getURLs() = { - userClassLoader.getURLs() - } -} - -private[spark] class ExecutorURLClassLoader(urls: Array[URL], parent: ClassLoader) - extends URLClassLoader(urls, parent) with MutableURLClassLoader { - - override def addURL(url: URL) { - super.addURL(url) - } -} - diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index d2e1680a5fd1b..40fc6b59cdf7b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -52,8 +52,13 @@ private[spark] class SparkDeploySchedulerBackend( conf.get("spark.driver.host"), conf.get("spark.driver.port"), CoarseGrainedSchedulerBackend.ACTOR_NAME) - val args = Seq(driverUrl, "{{EXECUTOR_ID}}", "{{HOSTNAME}}", "{{CORES}}", "{{APP_ID}}", - "{{WORKER_URL}}") + val args = Seq( + "--driver-url", driverUrl, + "--executor-id", "{{EXECUTOR_ID}}", + "--hostname", "{{HOSTNAME}}", + "--cores", "{{CORES}}", + "--app-id", "{{APP_ID}}", + "--worker-url", "{{WORKER_URL}}") val extraJavaOpts = sc.conf.getOption("spark.executor.extraJavaOptions") .map(Utils.splitCommandString).getOrElse(Seq.empty) val classPathEntries = sc.conf.getOption("spark.executor.extraClassPath") diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index 0d1c2a916ca7f..90dfe14352a8e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ 
-154,18 +154,25 @@ private[spark] class CoarseMesosSchedulerBackend( if (uri == null) { val runScript = new File(executorSparkHome, "./bin/spark-class").getCanonicalPath command.setValue( - "%s \"%s\" org.apache.spark.executor.CoarseGrainedExecutorBackend %s %s %s %d %s".format( - prefixEnv, runScript, driverUrl, offer.getSlaveId.getValue, - offer.getHostname, numCores, appId)) + "%s \"%s\" org.apache.spark.executor.CoarseGrainedExecutorBackend" + .format(prefixEnv, runScript) + + s" --driver-url $driverUrl" + + s" --executor-id ${offer.getSlaveId.getValue}" + + s" --hostname ${offer.getHostname}" + + s" --cores $numCores" + + s" --app-id $appId") } else { // Grab everything to the first '.'. We'll use that and '*' to // glob the directory "correctly". val basename = uri.split('/').last.split('.').head command.setValue( - ("cd %s*; %s " + - "./bin/spark-class org.apache.spark.executor.CoarseGrainedExecutorBackend %s %s %s %d %s") - .format(basename, prefixEnv, driverUrl, offer.getSlaveId.getValue, - offer.getHostname, numCores, appId)) + s"cd $basename*; $prefixEnv " + + "./bin/spark-class org.apache.spark.executor.CoarseGrainedExecutorBackend" + + s" --driver-url $driverUrl" + + s" --executor-id ${offer.getSlaveId.getValue}" + + s" --hostname ${offer.getHostname}" + + s" --cores $numCores" + + s" --app-id $appId") command.addUris(CommandInfo.URI.newBuilder().setValue(uri)) } command.build() diff --git a/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala b/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala new file mode 100644 index 0000000000000..d9c7103b2f3bf --- /dev/null +++ b/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util + +import java.net.{URLClassLoader, URL} +import java.util.Enumeration +import java.util.concurrent.ConcurrentHashMap + +import scala.collection.JavaConversions._ + +import org.apache.spark.util.ParentClassLoader + +/** + * URL class loader that exposes the `addURL` and `getURLs` methods in URLClassLoader. + */ +private[spark] class MutableURLClassLoader(urls: Array[URL], parent: ClassLoader) + extends URLClassLoader(urls, parent) { + + override def addURL(url: URL): Unit = { + super.addURL(url) + } + + override def getURLs(): Array[URL] = { + super.getURLs() + } + +} + +/** + * A mutable class loader that gives preference to its own URLs over the parent class loader + * when loading classes and resources. 
+ */ +private[spark] class ChildFirstURLClassLoader(urls: Array[URL], parent: ClassLoader) + extends MutableURLClassLoader(urls, null) { + + private val parentClassLoader = new ParentClassLoader(parent) + + /** + * Used to implement fine-grained class loading locks similar to what is done by Java 7. This + * prevents deadlock issues when using non-hierarchical class loaders. + * + * Note that due to Java 6 compatibility (and some issues with implementing class loaders in + * Scala), Java 7's `ClassLoader.registerAsParallelCapable` method is not called. + */ + private val locks = new ConcurrentHashMap[String, Object]() + + override def loadClass(name: String, resolve: Boolean): Class[_] = { + var lock = locks.get(name) + if (lock == null) { + val newLock = new Object() + lock = locks.putIfAbsent(name, newLock) + if (lock == null) { + lock = newLock + } + } + + lock.synchronized { + try { + super.loadClass(name, resolve) + } catch { + case e: ClassNotFoundException => + parentClassLoader.loadClass(name, resolve) + } + } + } + + override def getResource(name: String): URL = { + val url = super.findResource(name) + val res = if (url != null) url else parentClassLoader.getResource(name) + res + } + + override def getResources(name: String): Enumeration[URL] = { + val urls = super.findResources(name) + val res = + if (urls != null && urls.hasMoreElements()) { + urls + } else { + parentClassLoader.getResources(name) + } + res + } + + override def addURL(url: URL) { + super.addURL(url) + } + +} diff --git a/core/src/main/scala/org/apache/spark/util/ParentClassLoader.scala b/core/src/main/scala/org/apache/spark/util/ParentClassLoader.scala index 3abc12681fe9a..6d8d9e8da3678 100644 --- a/core/src/main/scala/org/apache/spark/util/ParentClassLoader.scala +++ b/core/src/main/scala/org/apache/spark/util/ParentClassLoader.scala @@ -18,7 +18,7 @@ package org.apache.spark.util /** - * A class loader which makes findClass accesible to the child + * A class loader which makes some protected methods in ClassLoader accesible. 
*/ private[spark] class ParentClassLoader(parent: ClassLoader) extends ClassLoader(parent) { @@ -29,4 +29,9 @@ private[spark] class ParentClassLoader(parent: ClassLoader) extends ClassLoader( override def loadClass(name: String): Class[_] = { super.loadClass(name) } + + override def loadClass(name: String, resolve: Boolean): Class[_] = { + super.loadClass(name, resolve) + } + } diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala index e08210ae60d17..ea6b73bc68b34 100644 --- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala @@ -197,6 +197,18 @@ class SparkConfSuite extends FunSuite with LocalSparkContext with ResetSystemPro serializer.newInstance().serialize(new StringBuffer()) } + test("deprecated config keys") { + val conf = new SparkConf() + .set("spark.files.userClassPathFirst", "true") + .set("spark.yarn.user.classpath.first", "true") + assert(conf.contains("spark.files.userClassPathFirst")) + assert(conf.contains("spark.executor.userClassPathFirst")) + assert(conf.contains("spark.yarn.user.classpath.first")) + assert(conf.getBoolean("spark.files.userClassPathFirst", false)) + assert(conf.getBoolean("spark.executor.userClassPathFirst", false)) + assert(conf.getBoolean("spark.yarn.user.classpath.first", false)) + } + } class Class1 {} diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 1ddccae1262bc..46d745c4ecbfa 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -21,6 +21,8 @@ import java.io._ import scala.collection.mutable.ArrayBuffer +import com.google.common.base.Charsets.UTF_8 +import com.google.common.io.ByteStreams import org.scalatest.FunSuite import org.scalatest.Matchers import org.scalatest.concurrent.Timeouts @@ -450,6 +452,19 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties PythonRunner.formatPaths(Utils.resolveURIs(pyFiles)).mkString(",")) } + test("user classpath first in driver") { + val systemJar = TestUtils.createJarWithFiles(Map("test.resource" -> "SYSTEM")) + val userJar = TestUtils.createJarWithFiles(Map("test.resource" -> "USER")) + val args = Seq( + "--class", UserClasspathFirstTest.getClass.getName.stripSuffix("$"), + "--name", "testApp", + "--master", "local", + "--conf", "spark.driver.extraClassPath=" + systemJar, + "--conf", "spark.driver.userClassPathFirst=true", + userJar.toString) + runSparkSubmit(args) + } + test("SPARK_CONF_DIR overrides spark-defaults.conf") { forConfDir(Map("spark.executor.memory" -> "2.3g")) { path => val unusedJar = TestUtils.createJarWithClasses(Seq.empty) @@ -541,3 +556,15 @@ object SimpleApplicationTest { } } } + +object UserClasspathFirstTest { + def main(args: Array[String]) { + val ccl = Thread.currentThread().getContextClassLoader() + val resource = ccl.getResourceAsStream("test.resource") + val bytes = ByteStreams.toByteArray(resource) + val contents = new String(bytes, 0, bytes.length, UTF_8) + if (contents != "USER") { + throw new SparkException("Should have read user resource, but instead read: " + contents) + } + } +} diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorURLClassLoaderSuite.scala b/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala similarity index 90% rename from 
core/src/test/scala/org/apache/spark/executor/ExecutorURLClassLoaderSuite.scala rename to core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala index b7912c09d1410..31e3b7e7bb71b 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorURLClassLoaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.executor +package org.apache.spark.util import java.net.URLClassLoader @@ -24,7 +24,7 @@ import org.scalatest.FunSuite import org.apache.spark.{LocalSparkContext, SparkContext, SparkException, TestUtils} import org.apache.spark.util.Utils -class ExecutorURLClassLoaderSuite extends FunSuite { +class MutableURLClassLoaderSuite extends FunSuite { val urls2 = List(TestUtils.createJarWithClasses( classNames = Seq("FakeClass1", "FakeClass2", "FakeClass3"), @@ -37,7 +37,7 @@ class ExecutorURLClassLoaderSuite extends FunSuite { test("child first") { val parentLoader = new URLClassLoader(urls2, null) - val classLoader = new ChildExecutorURLClassLoader(urls, parentLoader) + val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass2").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "1") @@ -47,7 +47,7 @@ class ExecutorURLClassLoaderSuite extends FunSuite { test("parent first") { val parentLoader = new URLClassLoader(urls2, null) - val classLoader = new ExecutorURLClassLoader(urls, parentLoader) + val classLoader = new MutableURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass1").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") @@ -57,7 +57,7 @@ class ExecutorURLClassLoaderSuite extends FunSuite { test("child first can fall back") { val parentLoader = new URLClassLoader(urls2, null) - val classLoader = new ChildExecutorURLClassLoader(urls, parentLoader) + val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass3").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") @@ -65,7 +65,7 @@ class ExecutorURLClassLoaderSuite extends FunSuite { test("child first can fail") { val parentLoader = new URLClassLoader(urls2, null) - val classLoader = new ChildExecutorURLClassLoader(urls, parentLoader) + val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) intercept[java.lang.ClassNotFoundException] { classLoader.loadClass("FakeClassDoesNotExist").newInstance() } diff --git a/docs/configuration.md b/docs/configuration.md index 00e973c245005..eb0d6d33c97d9 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -230,6 +230,15 @@ Apart from these, the following properties are also available, and may be useful Set a special library path to use when launching the driver JVM. + + + + + @@ -297,13 +306,11 @@ Apart from these, the following properties are also available, and may be useful - + @@ -865,8 +872,8 @@ Apart from these, the following properties are also available, and may be useful @@ -911,8 +918,8 @@ Apart from these, the following properties are also available, and may be useful @@ -920,7 +927,7 @@ Apart from these, the following properties are also available, and may be useful @@ -930,7 +937,7 @@ Apart from these, the following properties are also available, and may be useful @@ -939,7 +946,7 @@ Apart from these, the following properties are also available, and may be useful
    Executor IDTotal Tasks Failed Tasks Succeeded TasksInputOutputShuffle ReadShuffle WriteShuffle Spill (Memory)Shuffle Spill (Disk) + Input Size / Records + + Output Size / Records + + + Shuffle Read Size / Records + + + Shuffle Write Size / Records + Shuffle Spill (Memory)Shuffle Spill (Disk)
    {v.failedTasks + v.succeededTasks} {v.failedTasks} {v.succeededTasks} - {Utils.bytesToString(v.inputBytes)} - {Utils.bytesToString(v.outputBytes)} - {Utils.bytesToString(v.shuffleRead)} - {Utils.bytesToString(v.shuffleWrite)} - {Utils.bytesToString(v.memoryBytesSpilled)} - {Utils.bytesToString(v.diskBytesSpilled)} + {s"${Utils.bytesToString(v.inputBytes)} / ${v.inputRecords}"} + + {s"${Utils.bytesToString(v.outputBytes)} / ${v.outputRecords}"} + + {s"${Utils.bytesToString(v.shuffleRead)} / ${v.shuffleReadRecords}"} + + {s"${Utils.bytesToString(v.shuffleWrite)} / ${v.shuffleWriteRecords}"} + + {Utils.bytesToString(v.memoryBytesSpilled)} + + {Utils.bytesToString(v.diskBytesSpilled)} +
    {UIUtils.formatDuration(millis.toLong)}{Utils.bytesToString(d.toLong)}{Utils.bytesToString(d.toLong)}{s"${Utils.bytesToString(d.toLong)} / ${recordDist.next().toLong}"}InputInput Size / RecordsOutputOutput Size / RecordsShuffle Read (Remote)Shuffle Read Size / Records (Remote)Shuffle WriteShuffle Write Size / Records
    - {inputReadable} + {s"$inputReadable / $inputRecords"} - {outputReadable} + {s"$outputReadable / $outputRecords"} - {shuffleReadReadable} + {s"$shuffleReadReadable / $shuffleReadRecords"} - {shuffleWriteReadable} + {s"$shuffleWriteReadable / $shuffleWriteRecords"} {Utils.megabytesToString(driver.desc.mem.toLong)} {driver.desc.command.arguments(1)}{driver.desc.command.arguments(2)}
    spark.driver.userClassPathFirstfalse + (Experimental) Whether to give user-added jars precedence over Spark's own jars when loading + classes in the the driver. This feature can be used to mitigate conflicts between Spark's + dependencies and user dependencies. It is currently an experimental feature. +
    spark.executor.extraJavaOptions (none)
    spark.files.userClassPathFirstspark.executor.userClassPathFirst false - (Experimental) Whether to give user-added jars precedence over Spark's own jars when - loading classes in Executors. This feature can be used to mitigate conflicts between - Spark's dependencies and user dependencies. It is currently an experimental feature. - (Currently, this setting does not work for YARN, see SPARK-2996 for more details). + (Experimental) Same functionality as spark.driver.userClassPathFirst, but + applied to executor instances.
    spark.network.timeout 120 - Default timeout for all network interactions, in seconds. This config will be used in - place of spark.core.connection.ack.wait.timeout, spark.akka.timeout, + Default timeout for all network interactions, in seconds. This config will be used in + place of spark.core.connection.ack.wait.timeout, spark.akka.timeout, spark.storage.blockManagerSlaveTimeoutMs or spark.shuffle.io.connectionTimeout, if they are not configured. spark.shuffle.io.preferDirectBufs true - (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache - block transfer. For environments where off-heap memory is tightly limited, users may wish to + (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache + block transfer. For environments where off-heap memory is tightly limited, users may wish to turn this off to force all allocations from Netty to be on-heap.
    spark.shuffle.io.numConnectionsPerPeer 1 - (Netty only) Connections between hosts are reused in order to reduce connection buildup for + (Netty only) Connections between hosts are reused in order to reduce connection buildup for large clusters. For clusters with many hard disks and few hosts, this may result in insufficient concurrency to saturate all disks, and so users may consider increasing this value. 3 (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is - set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC + set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC pauses or transient network connectivity issues.
    5 (Netty only) Seconds to wait between retries of fetches. The maximum delay caused by retrying - is simply maxRetries * retryWait, by default 15 seconds. + is simply maxRetries * retryWait, by default 15 seconds.
    diff --git a/pom.xml b/pom.xml index f6f176d2004b7..a9e968af25453 100644 --- a/pom.xml +++ b/pom.xml @@ -342,7 +342,7 @@
    - + @@ -395,7 +395,7 @@ provided - + org.apache.commons commons-lang3 @@ -1178,13 +1178,19 @@ ${project.build.directory}/surefire-reports -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m + + + ${test_classpath} + true ${session.executionRootDirectory} 1 false false - ${test_classpath} true false diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 95f8dfa3d270f..8fb1239b4a96b 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -411,6 +411,10 @@ object TestSettings { lazy val settings = Seq ( // Fork new JVMs for tests and set Java options for those fork := true, + // Setting SPARK_DIST_CLASSPATH is a simple way to make sure any child processes + // launched by the tests have access to the correct test-time classpath. + envVars in Test += ("SPARK_DIST_CLASSPATH" -> + (fullClasspath in Test).value.files.map(_.getAbsolutePath).mkString(":").stripSuffix(":")), javaOptions in Test += "-Dspark.test.home=" + sparkHome, javaOptions in Test += "-Dspark.testing=1", javaOptions in Test += "-Dspark.port.maxRetries=100", @@ -423,10 +427,6 @@ object TestSettings { javaOptions in Test += "-ea", javaOptions in Test ++= "-Xmx3g -XX:PermSize=128M -XX:MaxNewSize=256m -XX:MaxPermSize=1g" .split(" ").toSeq, - // This places test scope jars on the classpath of executors during tests. - javaOptions in Test += - "-Dspark.executor.extraClassPath=" + (fullClasspath in Test).value.files. - map(_.getAbsolutePath).mkString(":").stripSuffix(":"), javaOptions += "-Xmx3g", // Show full stack trace and duration in test cases. testOptions in Test += Tests.Argument("-oDF"), diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 4cc320c5d59b5..a9bf861d160c1 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -19,9 +19,9 @@ package org.apache.spark.deploy.yarn import scala.util.control.NonFatal -import java.io.IOException +import java.io.{File, IOException} import java.lang.reflect.InvocationTargetException -import java.net.Socket +import java.net.{Socket, URL} import java.util.concurrent.atomic.AtomicReference import akka.actor._ @@ -38,7 +38,8 @@ import org.apache.spark.deploy.{PythonRunner, SparkHadoopUtil} import org.apache.spark.deploy.history.HistoryServer import org.apache.spark.scheduler.cluster.YarnSchedulerBackend import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ -import org.apache.spark.util.{AkkaUtils, SignalLogger, Utils} +import org.apache.spark.util.{AkkaUtils, ChildFirstURLClassLoader, MutableURLClassLoader, + SignalLogger, Utils} /** * Common application master functionality for Spark on Yarn. 
@@ -244,7 +245,6 @@ private[spark] class ApplicationMaster( host: String, port: String, isClusterMode: Boolean): Unit = { - val driverUrl = AkkaUtils.address( AkkaUtils.protocol(actorSystem), SparkEnv.driverActorSystemName, @@ -453,12 +453,24 @@ private[spark] class ApplicationMaster( private def startUserApplication(): Thread = { logInfo("Starting the user application in a separate Thread") System.setProperty("spark.executor.instances", args.numExecutors.toString) + + val classpath = Client.getUserClasspath(sparkConf) + val urls = classpath.map { entry => + new URL("file:" + new File(entry.getPath()).getAbsolutePath()) + } + val userClassLoader = + if (Client.isUserClassPathFirst(sparkConf, isDriver = true)) { + new ChildFirstURLClassLoader(urls, Utils.getContextOrSparkClassLoader) + } else { + new MutableURLClassLoader(urls, Utils.getContextOrSparkClassLoader) + } + if (args.primaryPyFile != null && args.primaryPyFile.endsWith(".py")) { System.setProperty("spark.submit.pyFiles", PythonRunner.formatPaths(args.pyFiles).mkString(",")) } - val mainMethod = Class.forName(args.userClass, false, - Thread.currentThread.getContextClassLoader).getMethod("main", classOf[Array[String]]) + val mainMethod = userClassLoader.loadClass(args.userClass) + .getMethod("main", classOf[Array[String]]) val userThread = new Thread { override def run() { @@ -483,6 +495,7 @@ private[spark] class ApplicationMaster( } } } + userThread.setContextClassLoader(userClassLoader) userThread.setName("Driver") userThread.start() userThread diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 8afc1ccdad732..46d9df93488cb 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -183,8 +183,7 @@ private[spark] class Client( private[yarn] def copyFileToRemote( destDir: Path, srcPath: Path, - replication: Short, - setPerms: Boolean = false): Path = { + replication: Short): Path = { val destFs = destDir.getFileSystem(hadoopConf) val srcFs = srcPath.getFileSystem(hadoopConf) var destPath = srcPath @@ -193,9 +192,7 @@ private[spark] class Client( logInfo(s"Uploading resource $srcPath -> $destPath") FileUtil.copy(srcFs, srcPath, destFs, destPath, false, hadoopConf) destFs.setReplication(destPath, replication) - if (setPerms) { - destFs.setPermission(destPath, new FsPermission(APP_FILE_PERMISSION)) - } + destFs.setPermission(destPath, new FsPermission(APP_FILE_PERMISSION)) } else { logInfo(s"Source and destination file systems are the same. Not copying $srcPath") } @@ -239,23 +236,22 @@ private[spark] class Client( /** * Copy the given main resource to the distributed cache if the scheme is not "local". * Otherwise, set the corresponding key in our SparkConf to handle it downstream. 
- * Each resource is represented by a 4-tuple of: + * Each resource is represented by a 3-tuple of: * (1) destination resource name, * (2) local path to the resource, - * (3) Spark property key to set if the scheme is not local, and - * (4) whether to set permissions for this resource + * (3) Spark property key to set if the scheme is not local */ List( - (SPARK_JAR, sparkJar(sparkConf), CONF_SPARK_JAR, false), - (APP_JAR, args.userJar, CONF_SPARK_USER_JAR, true), - ("log4j.properties", oldLog4jConf.orNull, null, false) - ).foreach { case (destName, _localPath, confKey, setPermissions) => + (SPARK_JAR, sparkJar(sparkConf), CONF_SPARK_JAR), + (APP_JAR, args.userJar, CONF_SPARK_USER_JAR), + ("log4j.properties", oldLog4jConf.orNull, null) + ).foreach { case (destName, _localPath, confKey) => val localPath: String = if (_localPath != null) _localPath.trim() else "" if (!localPath.isEmpty()) { val localURI = new URI(localPath) if (localURI.getScheme != LOCAL_SCHEME) { val src = getQualifiedLocalPath(localURI, hadoopConf) - val destPath = copyFileToRemote(dst, src, replication, setPermissions) + val destPath = copyFileToRemote(dst, src, replication) val destFs = FileSystem.get(destPath.toUri(), hadoopConf) distCacheMgr.addResource(destFs, hadoopConf, destPath, localResources, LocalResourceType.FILE, destName, statCache) @@ -707,7 +703,7 @@ object Client extends Logging { * Return the path to the given application's staging directory. */ private def getAppStagingDir(appId: ApplicationId): String = { - SPARK_STAGING + Path.SEPARATOR + appId.toString() + Path.SEPARATOR + buildPath(SPARK_STAGING, appId.toString()) } /** @@ -783,7 +779,13 @@ object Client extends Logging { /** * Populate the classpath entry in the given environment map. - * This includes the user jar, Spark jar, and any extra application jars. + * + * User jars are generally not added to the JVM's system classpath; those are handled by the AM + * and executor backend. When the deprecated `spark.yarn.user.classpath.first` is used, user jars + * are included in the system classpath, though. The extra class path and other uploaded files are + * always made available through the system class path. + * + * @param args Client arguments (when starting the AM) or null (when starting executors). */ private[yarn] def populateClasspath( args: ClientArguments, @@ -795,48 +797,38 @@ object Client extends Logging { addClasspathEntry( YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), env ) - - // Normally the users app.jar is last in case conflicts with spark jars if (sparkConf.getBoolean("spark.yarn.user.classpath.first", false)) { - addUserClasspath(args, sparkConf, env) - addFileToClasspath(sparkJar(sparkConf), SPARK_JAR, env) - populateHadoopClasspath(conf, env) - } else { - addFileToClasspath(sparkJar(sparkConf), SPARK_JAR, env) - populateHadoopClasspath(conf, env) - addUserClasspath(args, sparkConf, env) + val userClassPath = + if (args != null) { + getUserClasspath(Option(args.userJar), Option(args.addJars)) + } else { + getUserClasspath(sparkConf) + } + userClassPath.foreach { x => + addFileToClasspath(x, null, env) + } } - - // Append all jar files under the working directory to the classpath. 
- addClasspathEntry( - YarnSparkHadoopUtil.expandEnvironment(Environment.PWD) + Path.SEPARATOR + "*", env - ) + addFileToClasspath(new URI(sparkJar(sparkConf)), SPARK_JAR, env) + populateHadoopClasspath(conf, env) + sys.env.get(ENV_DIST_CLASSPATH).foreach(addClasspathEntry(_, env)) } /** - * Adds the user jars which have local: URIs (or alternate names, such as APP_JAR) explicitly - * to the classpath. + * Returns a list of URIs representing the user classpath. + * + * @param conf Spark configuration. */ - private def addUserClasspath( - args: ClientArguments, - conf: SparkConf, - env: HashMap[String, String]): Unit = { - - // If `args` is not null, we are launching an AM container. - // Otherwise, we are launching executor containers. - val (mainJar, secondaryJars) = - if (args != null) { - (args.userJar, args.addJars) - } else { - (conf.get(CONF_SPARK_USER_JAR, null), conf.get(CONF_SPARK_YARN_SECONDARY_JARS, null)) - } + def getUserClasspath(conf: SparkConf): Array[URI] = { + getUserClasspath(conf.getOption(CONF_SPARK_USER_JAR), + conf.getOption(CONF_SPARK_YARN_SECONDARY_JARS)) + } - addFileToClasspath(mainJar, APP_JAR, env) - if (secondaryJars != null) { - secondaryJars.split(",").filter(_.nonEmpty).foreach { jar => - addFileToClasspath(jar, null, env) - } - } + private def getUserClasspath( + mainJar: Option[String], + secondaryJars: Option[String]): Array[URI] = { + val mainUri = mainJar.orElse(Some(APP_JAR)).map(new URI(_)) + val secondaryUris = secondaryJars.map(_.split(",")).toSeq.flatten.map(new URI(_)) + (mainUri ++ secondaryUris).toArray } /** @@ -847,27 +839,19 @@ object Client extends Logging { * * If not a "local:" file and no alternate name, the environment is not modified. * - * @param path Path to add to classpath (optional). + * @param uri URI to add to classpath (optional). * @param fileName Alternate name for the file (optional). * @param env Map holding the environment variables. */ private def addFileToClasspath( - path: String, + uri: URI, fileName: String, env: HashMap[String, String]): Unit = { - if (path != null) { - scala.util.control.Exception.ignoring(classOf[URISyntaxException]) { - val uri = new URI(path) - if (uri.getScheme == LOCAL_SCHEME) { - addClasspathEntry(uri.getPath, env) - return - } - } - } - if (fileName != null) { - addClasspathEntry( - YarnSparkHadoopUtil.expandEnvironment(Environment.PWD) + Path.SEPARATOR + fileName, env - ) + if (uri != null && uri.getScheme == LOCAL_SCHEME) { + addClasspathEntry(uri.getPath, env) + } else if (fileName != null) { + addClasspathEntry(buildPath( + YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), fileName), env) } } @@ -963,4 +947,23 @@ object Client extends Logging { new Path(qualifiedURI) } + /** + * Whether to consider jars provided by the user to have precedence over the Spark jars when + * loading user classes. + */ + def isUserClassPathFirst(conf: SparkConf, isDriver: Boolean): Boolean = { + if (isDriver) { + conf.getBoolean("spark.driver.userClassPathFirst", false) + } else { + conf.getBoolean("spark.executor.userClassPathFirst", false) + } + } + + /** + * Joins all the path components using Path.SEPARATOR. 
+ */ + def buildPath(components: String*): String = { + components.mkString(Path.SEPARATOR) + } + } diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index 7cd8c5f0f9204..6d5b8fda76ab8 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -17,6 +17,7 @@ package org.apache.spark.deploy.yarn +import java.io.File import java.net.URI import java.nio.ByteBuffer @@ -57,7 +58,7 @@ class ExecutorRunnable( var nmClient: NMClient = _ val yarnConf: YarnConfiguration = new YarnConfiguration(conf) lazy val env = prepareEnvironment(container) - + def run = { logInfo("Starting Executor Container") nmClient = NMClient.createNMClient() @@ -185,6 +186,16 @@ class ExecutorRunnable( // For log4j configuration to reference javaOpts += ("-Dspark.yarn.app.container.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR) + val userClassPath = Client.getUserClasspath(sparkConf).flatMap { uri => + val absPath = + if (new File(uri.getPath()).isAbsolute()) { + uri.getPath() + } else { + Client.buildPath(Environment.PWD.$(), uri.getPath()) + } + Seq("--user-class-path", "file:" + absPath) + }.toSeq + val commands = prefixEnv ++ Seq( YarnSparkHadoopUtil.expandEnvironment(Environment.JAVA_HOME) + "/bin/java", "-server", @@ -196,11 +207,13 @@ class ExecutorRunnable( "-XX:OnOutOfMemoryError='kill %p'") ++ javaOpts ++ Seq("org.apache.spark.executor.CoarseGrainedExecutorBackend", - masterAddress.toString, - slaveId.toString, - hostname.toString, - executorCores.toString, - appId, + "--driver-url", masterAddress.toString, + "--executor-id", slaveId.toString, + "--hostname", hostname.toString, + "--cores", executorCores.toString, + "--app-id", appId) ++ + userClassPath ++ + Seq( "1>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout", "2>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr") diff --git a/yarn/src/test/resources/log4j.properties b/yarn/src/test/resources/log4j.properties index 287c8e3563503..aab41fa49430f 100644 --- a/yarn/src/test/resources/log4j.properties +++ b/yarn/src/test/resources/log4j.properties @@ -16,7 +16,7 @@ # # Set everything to be logged to the file target/unit-tests.log -log4j.rootCategory=INFO, file +log4j.rootCategory=DEBUG, file log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file.append=true log4j.appender.file.file=target/unit-tests.log @@ -25,4 +25,4 @@ log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{ # Ignore messages below warning level from Jetty, because it's a bit verbose log4j.logger.org.eclipse.jetty=WARN -org.eclipse.jetty.LEVEL=WARN +log4j.logger.org.apache.hadoop=WARN diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index 2bb3dcffd61d9..f8f8129d220e4 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -82,6 +82,7 @@ class ClientSuite extends FunSuite with Matchers { test("Local jar URIs") { val conf = new Configuration() val sparkConf = new SparkConf().set(Client.CONF_SPARK_JAR, SPARK) + .set("spark.yarn.user.classpath.first", "true") val env = new MutableHashMap[String, String]() val args = new ClientArguments(Array("--jar", USER, "--addJars", ADDED), sparkConf) @@ -98,13 +99,10 @@ class ClientSuite 
extends FunSuite with Matchers { }) if (classOf[Environment].getMethods().exists(_.getName == "$$")) { cp should contain("{{PWD}}") - cp should contain(s"{{PWD}}${Path.SEPARATOR}*") } else if (Utils.isWindows) { cp should contain("%PWD%") - cp should contain(s"%PWD%${Path.SEPARATOR}*") } else { cp should contain(Environment.PWD.$()) - cp should contain(s"${Environment.PWD.$()}${File.separator}*") } cp should not contain (Client.SPARK_JAR) cp should not contain (Client.APP_JAR) @@ -117,7 +115,7 @@ class ClientSuite extends FunSuite with Matchers { val client = spy(new Client(args, conf, sparkConf)) doReturn(new Path("/")).when(client).copyFileToRemote(any(classOf[Path]), - any(classOf[Path]), anyShort(), anyBoolean()) + any(classOf[Path]), anyShort()) val tempDir = Utils.createTempDir() try { diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index e39de82740b1d..0e37276ba724b 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -17,27 +17,34 @@ package org.apache.spark.deploy.yarn -import java.io.File +import java.io.{File, FileOutputStream, OutputStreamWriter} +import java.util.Properties import java.util.concurrent.TimeUnit import scala.collection.JavaConversions._ import scala.collection.mutable -import com.google.common.base.Charsets +import com.google.common.base.Charsets.UTF_8 +import com.google.common.io.ByteStreams import com.google.common.io.Files import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.server.MiniYARNCluster import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} -import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException} +import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException, TestUtils} import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded} import org.apache.spark.util.Utils +/** + * Integration tests for YARN; these tests use a mini Yarn cluster to run Spark-on-YARN + * applications, and require the Spark assembly to be built before they can be successfully + * run. + */ class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers with Logging { - // log4j configuration for the Yarn containers, so that their output is collected - // by Yarn instead of trying to overwrite unit-tests.log. + // log4j configuration for the YARN containers, so that their output is collected + // by YARN instead of trying to overwrite unit-tests.log. 
private val LOG4J_CONF = """ |log4j.rootCategory=DEBUG, console |log4j.appender.console=org.apache.log4j.ConsoleAppender @@ -52,13 +59,11 @@ class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers wit | |from pyspark import SparkConf , SparkContext |if __name__ == "__main__": - | if len(sys.argv) != 3: - | print >> sys.stderr, "Usage: test.py [master] [result file]" + | if len(sys.argv) != 2: + | print >> sys.stderr, "Usage: test.py [result file]" | exit(-1) - | conf = SparkConf() - | conf.setMaster(sys.argv[1]).setAppName("python test in yarn cluster mode") - | sc = SparkContext(conf=conf) - | status = open(sys.argv[2],'w') + | sc = SparkContext(conf=SparkConf()) + | status = open(sys.argv[1],'w') | result = "failure" | rdd = sc.parallelize(range(10)) | cnt = rdd.count() @@ -72,23 +77,17 @@ class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers wit private var yarnCluster: MiniYARNCluster = _ private var tempDir: File = _ private var fakeSparkJar: File = _ - private var oldConf: Map[String, String] = _ + private var logConfDir: File = _ override def beforeAll() { super.beforeAll() tempDir = Utils.createTempDir() - - val logConfDir = new File(tempDir, "log4j") + logConfDir = new File(tempDir, "log4j") logConfDir.mkdir() val logConfFile = new File(logConfDir, "log4j.properties") - Files.write(LOG4J_CONF, logConfFile, Charsets.UTF_8) - - val childClasspath = logConfDir.getAbsolutePath() + File.pathSeparator + - sys.props("java.class.path") - - oldConf = sys.props.filter { case (k, v) => k.startsWith("spark.") }.toMap + Files.write(LOG4J_CONF, logConfFile, UTF_8) yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1) yarnCluster.init(new YarnConfiguration()) @@ -119,99 +118,165 @@ class YarnClusterSuite extends FunSuite with BeforeAndAfterAll with Matchers wit } logInfo(s"RM address in configuration is ${config.get(YarnConfiguration.RM_ADDRESS)}") - config.foreach { e => - sys.props += ("spark.hadoop." 
+ e.getKey() -> e.getValue()) - } fakeSparkJar = File.createTempFile("sparkJar", null, tempDir) - val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) - sys.props += ("spark.yarn.appMasterEnv.SPARK_HOME" -> sparkHome) - sys.props += ("spark.executorEnv.SPARK_HOME" -> sparkHome) - sys.props += ("spark.yarn.jar" -> ("local:" + fakeSparkJar.getAbsolutePath())) - sys.props += ("spark.executor.instances" -> "1") - sys.props += ("spark.driver.extraClassPath" -> childClasspath) - sys.props += ("spark.executor.extraClassPath" -> childClasspath) - sys.props += ("spark.executor.extraJavaOptions" -> "-Dfoo=\"one two three\"") - sys.props += ("spark.driver.extraJavaOptions" -> "-Dfoo=\"one two three\"") } override def afterAll() { yarnCluster.stop() - sys.props.retain { case (k, v) => !k.startsWith("spark.") } - sys.props ++= oldConf super.afterAll() } test("run Spark in yarn-client mode") { - var result = File.createTempFile("result", null, tempDir) - YarnClusterDriver.main(Array("yarn-client", result.getAbsolutePath())) - checkResult(result) - - // verify log urls are present - YarnClusterDriver.listener.addedExecutorInfos.values.foreach { info => - assert(info.logUrlMap.nonEmpty) - } + testBasicYarnApp(true) } test("run Spark in yarn-cluster mode") { - val main = YarnClusterDriver.getClass.getName().stripSuffix("$") - var result = File.createTempFile("result", null, tempDir) - - val args = Array("--class", main, - "--jar", "file:" + fakeSparkJar.getAbsolutePath(), - "--arg", "yarn-cluster", - "--arg", result.getAbsolutePath(), - "--num-executors", "1") - Client.main(args) - checkResult(result) - - // verify log urls are present. - YarnClusterDriver.listener.addedExecutorInfos.values.foreach { info => - assert(info.logUrlMap.nonEmpty) - } + testBasicYarnApp(false) } test("run Spark in yarn-cluster mode unsuccessfully") { - val main = YarnClusterDriver.getClass.getName().stripSuffix("$") - - // Use only one argument so the driver will fail - val args = Array("--class", main, - "--jar", "file:" + fakeSparkJar.getAbsolutePath(), - "--arg", "yarn-cluster", - "--num-executors", "1") + // Don't provide arguments so the driver will fail. val exception = intercept[SparkException] { - Client.main(args) + runSpark(false, mainClassName(YarnClusterDriver.getClass)) + fail("Spark application should have failed.") } - assert(Utils.exceptionString(exception).contains("Application finished with failed status")) } test("run Python application in yarn-cluster mode") { val primaryPyFile = new File(tempDir, "test.py") - Files.write(TEST_PYFILE, primaryPyFile, Charsets.UTF_8) + Files.write(TEST_PYFILE, primaryPyFile, UTF_8) val pyFile = new File(tempDir, "test2.py") - Files.write(TEST_PYFILE, pyFile, Charsets.UTF_8) + Files.write(TEST_PYFILE, pyFile, UTF_8) var result = File.createTempFile("result", null, tempDir) - val args = Array("--class", "org.apache.spark.deploy.PythonRunner", - "--primary-py-file", primaryPyFile.getAbsolutePath(), - "--py-files", pyFile.getAbsolutePath(), - "--arg", "yarn-cluster", - "--arg", result.getAbsolutePath(), - "--name", "python test in yarn-cluster mode", - "--num-executors", "1") - Client.main(args) + // The sbt assembly does not include pyspark / py4j python dependencies, so we need to + // propagate SPARK_HOME so that those are added to PYTHONPATH. See PythonUtils.scala. 
+ val sparkHome = sys.props("spark.test.home") + val extraConf = Map( + "spark.executorEnv.SPARK_HOME" -> sparkHome, + "spark.yarn.appMasterEnv.SPARK_HOME" -> sparkHome) + + runSpark(false, primaryPyFile.getAbsolutePath(), + sparkArgs = Seq("--py-files", pyFile.getAbsolutePath()), + appArgs = Seq(result.getAbsolutePath()), + extraConf = extraConf) checkResult(result) } + test("user class path first in client mode") { + testUseClassPathFirst(true) + } + + test("user class path first in cluster mode") { + testUseClassPathFirst(false) + } + + private def testBasicYarnApp(clientMode: Boolean): Unit = { + var result = File.createTempFile("result", null, tempDir) + runSpark(clientMode, mainClassName(YarnClusterDriver.getClass), + appArgs = Seq(result.getAbsolutePath())) + checkResult(result) + } + + private def testUseClassPathFirst(clientMode: Boolean): Unit = { + // Create a jar file that contains a different version of "test.resource". + val originalJar = TestUtils.createJarWithFiles(Map("test.resource" -> "ORIGINAL"), tempDir) + val userJar = TestUtils.createJarWithFiles(Map("test.resource" -> "OVERRIDDEN"), tempDir) + val driverResult = File.createTempFile("driver", null, tempDir) + val executorResult = File.createTempFile("executor", null, tempDir) + runSpark(clientMode, mainClassName(YarnClasspathTest.getClass), + appArgs = Seq(driverResult.getAbsolutePath(), executorResult.getAbsolutePath()), + extraClassPath = Seq(originalJar.getPath()), + extraJars = Seq("local:" + userJar.getPath()), + extraConf = Map( + "spark.driver.userClassPathFirst" -> "true", + "spark.executor.userClassPathFirst" -> "true")) + checkResult(driverResult, "OVERRIDDEN") + checkResult(executorResult, "OVERRIDDEN") + } + + private def runSpark( + clientMode: Boolean, + klass: String, + appArgs: Seq[String] = Nil, + sparkArgs: Seq[String] = Nil, + extraClassPath: Seq[String] = Nil, + extraJars: Seq[String] = Nil, + extraConf: Map[String, String] = Map()): Unit = { + val master = if (clientMode) "yarn-client" else "yarn-cluster" + val props = new Properties() + + props.setProperty("spark.yarn.jar", "local:" + fakeSparkJar.getAbsolutePath()) + + val childClasspath = logConfDir.getAbsolutePath() + + File.pathSeparator + + sys.props("java.class.path") + + File.pathSeparator + + extraClassPath.mkString(File.pathSeparator) + props.setProperty("spark.driver.extraClassPath", childClasspath) + props.setProperty("spark.executor.extraClassPath", childClasspath) + + // SPARK-4267: make sure java options are propagated correctly. + props.setProperty("spark.driver.extraJavaOptions", "-Dfoo=\"one two three\"") + props.setProperty("spark.executor.extraJavaOptions", "-Dfoo=\"one two three\"") + + yarnCluster.getConfig().foreach { e => + props.setProperty("spark.hadoop." 
+ e.getKey(), e.getValue()) + } + + sys.props.foreach { case (k, v) => + if (k.startsWith("spark.")) { + props.setProperty(k, v) + } + } + + extraConf.foreach { case (k, v) => props.setProperty(k, v) } + + val propsFile = File.createTempFile("spark", ".properties", tempDir) + val writer = new OutputStreamWriter(new FileOutputStream(propsFile), UTF_8) + props.store(writer, "Spark properties.") + writer.close() + + val extraJarArgs = if (!extraJars.isEmpty()) Seq("--jars", extraJars.mkString(",")) else Nil + val mainArgs = + if (klass.endsWith(".py")) { + Seq(klass) + } else { + Seq("--class", klass, fakeSparkJar.getAbsolutePath()) + } + val argv = + Seq( + new File(sys.props("spark.test.home"), "bin/spark-submit").getAbsolutePath(), + "--master", master, + "--num-executors", "1", + "--properties-file", propsFile.getAbsolutePath()) ++ + extraJarArgs ++ + sparkArgs ++ + mainArgs ++ + appArgs + + Utils.executeAndGetOutput(argv, + extraEnvironment = Map("YARN_CONF_DIR" -> tempDir.getAbsolutePath())) + } + /** * This is a workaround for an issue with yarn-cluster mode: the Client class will not provide * any sort of error when the job process finishes successfully, but the job itself fails. So * the tests enforce that something is written to a file after everything is ok to indicate * that the job succeeded. */ - private def checkResult(result: File) = { - var resultString = Files.toString(result, Charsets.UTF_8) - resultString should be ("success") + private def checkResult(result: File): Unit = { + checkResult(result, "success") + } + + private def checkResult(result: File, expected: String): Unit = { + var resultString = Files.toString(result, UTF_8) + resultString should be (expected) + } + + private def mainClassName(klass: Class[_]): String = { + klass.getName().stripSuffix("$") } } @@ -229,22 +294,22 @@ private object YarnClusterDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 var listener: SaveExecutorInfo = null - def main(args: Array[String]) = { - if (args.length != 2) { + def main(args: Array[String]): Unit = { + if (args.length != 1) { System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | - |Usage: YarnClusterDriver [master] [result file] + |Usage: YarnClusterDriver [result file] """.stripMargin) System.exit(1) } listener = new SaveExecutorInfo - val sc = new SparkContext(new SparkConf().setMaster(args(0)) + val sc = new SparkContext(new SparkConf() .setAppName("yarn \"test app\" 'with quotes' and \\back\\slashes and $dollarSigns")) sc.addSparkListener(listener) - val status = new File(args(1)) + val status = new File(args(0)) var result = "failure" try { val data = sc.parallelize(1 to 4, 4).collect().toSet @@ -253,7 +318,48 @@ private object YarnClusterDriver extends Logging with Matchers { result = "success" } finally { sc.stop() - Files.write(result, status, Charsets.UTF_8) + Files.write(result, status, UTF_8) + } + + // verify log urls are present + listener.addedExecutorInfos.values.foreach { info => + assert(info.logUrlMap.nonEmpty) + } + } + +} + +private object YarnClasspathTest { + + def main(args: Array[String]): Unit = { + if (args.length != 2) { + System.err.println( + s""" + |Invalid command line: ${args.mkString(" ")} + | + |Usage: YarnClasspathTest [driver result file] [executor result file] + """.stripMargin) + System.exit(1) + } + + readResource(args(0)) + val sc = new SparkContext(new SparkConf()) + try { + sc.parallelize(Seq(1)).foreach { x => readResource(args(1)) } + } finally { + sc.stop() + } + } + + private def 
readResource(resultPath: String): Unit = { + var result = "failure" + try { + val ccl = Thread.currentThread().getContextClassLoader() + val resource = ccl.getResourceAsStream("test.resource") + val bytes = ByteStreams.toByteArray(resource) + result = new String(bytes, 0, bytes.length, UTF_8) + } finally { + Files.write(result, new File(resultPath), UTF_8) } } From a95ed52157473fb0e42e910ee15270e7f0edf943 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 9 Feb 2015 21:18:48 -0800 Subject: [PATCH 084/357] [SPARK-5703] AllJobsPage throws empty.max exception If you have a `SparkListenerJobEnd` event without the corresponding `SparkListenerJobStart` event, then `JobProgressListener` will create an empty `JobUIData` with an empty `stageIds` list. However, later in `AllJobsPage` we call `stageIds.max`. If this is empty, it will throw an exception. This crashed my history server. Author: Andrew Or Closes #4490 from andrewor14/jobs-page-max and squashes the following commits: 21797d3 [Andrew Or] Check nonEmpty before calling max --- .../src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index 045c69da06feb..bd923d78a86ce 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -42,7 +42,9 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") { } def makeRow(job: JobUIData): Seq[Node] = { - val lastStageInfo = listener.stageIdToInfo.get(job.stageIds.max) + val lastStageInfo = Option(job.stageIds) + .filter(_.nonEmpty) + .flatMap { ids => listener.stageIdToInfo.get(ids.max) } val lastStageData = lastStageInfo.flatMap { s => listener.stageIdToData.get((s.stageId, s.attemptId)) } From a2d33d0b01af87e931d9d883638a52d7a86f6248 Mon Sep 17 00:00:00 2001 From: Kay Ousterhout Date: Mon, 9 Feb 2015 21:22:09 -0800 Subject: [PATCH 085/357] [SPARK-5701] Only set ShuffleReadMetrics when task has shuffle deps The updateShuffleReadMetrics method in TaskMetrics (called by the executor heartbeater) will currently always add a ShuffleReadMetrics to TaskMetrics (with values set to 0), even when the task didn't read any shuffle data. ShuffleReadMetrics should only be added if the task reads shuffle data. Author: Kay Ousterhout Closes #4488 from kayousterhout/SPARK-5701 and squashes the following commits: 673ed58 [Kay Ousterhout] SPARK-5701: Only set ShuffleReadMetrics when task has shuffle deps --- .../apache/spark/executor/TaskMetrics.scala | 22 ++++++++------- .../spark/executor/TaskMetricsSuite.scala | 28 +++++++++++++++++++ 2 files changed, 40 insertions(+), 10 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/executor/TaskMetricsSuite.scala diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index d05659193b334..bf3f1e4fc7832 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -177,8 +177,8 @@ class TaskMetrics extends Serializable { * Once https://issues.apache.org/jira/browse/SPARK-5225 is addressed, * we can store all the different inputMetrics (one per readMethod). 
*/ - private[spark] def getInputMetricsForReadMethod(readMethod: DataReadMethod): - InputMetrics =synchronized { + private[spark] def getInputMetricsForReadMethod( + readMethod: DataReadMethod): InputMetrics = synchronized { _inputMetrics match { case None => val metrics = new InputMetrics(readMethod) @@ -195,15 +195,17 @@ class TaskMetrics extends Serializable { * Aggregates shuffle read metrics for all registered dependencies into shuffleReadMetrics. */ private[spark] def updateShuffleReadMetrics(): Unit = synchronized { - val merged = new ShuffleReadMetrics() - for (depMetrics <- depsShuffleReadMetrics) { - merged.incFetchWaitTime(depMetrics.fetchWaitTime) - merged.incLocalBlocksFetched(depMetrics.localBlocksFetched) - merged.incRemoteBlocksFetched(depMetrics.remoteBlocksFetched) - merged.incRemoteBytesRead(depMetrics.remoteBytesRead) - merged.incRecordsRead(depMetrics.recordsRead) + if (!depsShuffleReadMetrics.isEmpty) { + val merged = new ShuffleReadMetrics() + for (depMetrics <- depsShuffleReadMetrics) { + merged.incFetchWaitTime(depMetrics.fetchWaitTime) + merged.incLocalBlocksFetched(depMetrics.localBlocksFetched) + merged.incRemoteBlocksFetched(depMetrics.remoteBlocksFetched) + merged.incRemoteBytesRead(depMetrics.remoteBytesRead) + merged.incRecordsRead(depMetrics.recordsRead) + } + _shuffleReadMetrics = Some(merged) } - _shuffleReadMetrics = Some(merged) } private[spark] def updateInputMetrics(): Unit = synchronized { diff --git a/core/src/test/scala/org/apache/spark/executor/TaskMetricsSuite.scala b/core/src/test/scala/org/apache/spark/executor/TaskMetricsSuite.scala new file mode 100644 index 0000000000000..326e203afe136 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/executor/TaskMetricsSuite.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.executor + +import org.scalatest.FunSuite + +class TaskMetricsSuite extends FunSuite { + test("[SPARK-5701] updateShuffleReadMetrics: ShuffleReadMetrics not added when no shuffle deps") { + val taskMetrics = new TaskMetrics() + taskMetrics.updateShuffleReadMetrics() + assert(taskMetrics.shuffleReadMetrics.isEmpty) + } +} From bd0b5ea708aa5b84adb67c039ec52408289718bb Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Mon, 9 Feb 2015 21:33:34 -0800 Subject: [PATCH 086/357] [SQL] Remove the duplicated code Author: Cheng Hao Closes #4494 from chenghao-intel/tiny_code_change and squashes the following commits: 450dfe7 [Cheng Hao] remove the duplicated code --- .../org/apache/spark/sql/execution/SparkStrategies.scala | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 81bcf5a6f32dd..edf8a5be64ff1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -342,11 +342,6 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { ExecutedCommand( RunnableDescribeCommand(resultPlan, resultPlan.output, isExtended)) :: Nil - case LogicalDescribeCommand(table, isExtended) => - val resultPlan = self.sqlContext.executePlan(table).executedPlan - ExecutedCommand( - RunnableDescribeCommand(resultPlan, resultPlan.output, isExtended)) :: Nil - case _ => Nil } } From ef2f55b97f58fa06acb30e9e0172fb66fba383bc Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Mon, 9 Feb 2015 22:09:07 -0800 Subject: [PATCH 087/357] [SPARK-5597][MLLIB] save/load for decision trees and emsembles This is based on #4444 from jkbradley with the following changes: 1. Node schema updated to ~~~ treeId: int nodeId: Int predict/ |- predict: Double |- prob: Double impurity: Double isLeaf: Boolean split/ |- feature: Int |- threshold: Double |- featureType: Int |- categories: Array[Double] leftNodeId: Integer rightNodeId: Integer infoGain: Double ~~~ 2. Some refactor of the implementation. Closes #4444. Author: Joseph K. Bradley Author: Xiangrui Meng Closes #4493 from mengxr/SPARK-5597 and squashes the following commits: 75e3bb6 [Xiangrui Meng] fix style 2b0033d [Xiangrui Meng] update tree export schema and refactor the implementation 45873a2 [Joseph K. Bradley] org imports 1d4c264 [Joseph K. Bradley] Added save/load for tree ensembles dcdbf85 [Joseph K. 
Bradley] added save/load for decision tree but need to generalize it to ensembles --- .../mllib/tree/model/DecisionTreeModel.scala | 197 +++++++++++++++++- .../tree/model/InformationGainStats.scala | 4 +- .../apache/spark/mllib/tree/model/Node.scala | 5 + .../spark/mllib/tree/model/Predict.scala | 7 + .../mllib/tree/model/treeEnsembleModels.scala | 157 +++++++++++++- .../spark/mllib/tree/DecisionTreeSuite.scala | 120 ++++++++++- .../tree/GradientBoostedTreesSuite.scala | 81 ++++--- .../spark/mllib/tree/RandomForestSuite.scala | 28 ++- 8 files changed, 561 insertions(+), 38 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala index a25e625a4017a..89ecf3773dd77 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala @@ -17,11 +17,17 @@ package org.apache.spark.mllib.tree.model +import scala.collection.mutable + +import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.tree.configuration.{Algo, FeatureType} import org.apache.spark.mllib.tree.configuration.Algo._ +import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row, SQLContext} /** * :: Experimental :: @@ -31,7 +37,7 @@ import org.apache.spark.rdd.RDD * @param algo algorithm type -- classification or regression */ @Experimental -class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable { +class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable with Saveable { /** * Predict values for a single data point using the model trained. 
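The hunk below adds save/load support to DecisionTreeModel (metadata is written as JSON and the tree nodes as Parquet). As a rough usage sketch, assuming an active SparkContext named sc (for example in spark-shell) and treating the input and output paths as placeholders:

    import org.apache.spark.mllib.tree.DecisionTree
    import org.apache.spark.mllib.tree.model.DecisionTreeModel
    import org.apache.spark.mllib.util.MLUtils

    // Train a small binary classifier on LibSVM-formatted data (placeholder path).
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    // Arguments: input, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins.
    val model = DecisionTree.trainClassifier(data, 2, Map[Int, Int](), "gini", 5, 32)

    // Persist the model and read it back using the methods introduced in this patch.
    model.save(sc, "/tmp/myDecisionTreeModel")          // placeholder output path
    val sameModel = DecisionTreeModel.load(sc, "/tmp/myDecisionTreeModel")

RandomForestModel and GradientBoostedTreesModel gain the same save/load pattern further down in this patch.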
@@ -98,4 +104,193 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable header + topNode.subtreeToString(2) } + override def save(sc: SparkContext, path: String): Unit = { + DecisionTreeModel.SaveLoadV1_0.save(sc, path, this) + } + + override protected def formatVersion: String = "1.0" +} + +object DecisionTreeModel extends Loader[DecisionTreeModel] { + + private[tree] object SaveLoadV1_0 { + + def thisFormatVersion = "1.0" + + // Hard-code class name string in case it changes in the future + def thisClassName = "org.apache.spark.mllib.tree.DecisionTreeModel" + + case class PredictData(predict: Double, prob: Double) { + def toPredict: Predict = new Predict(predict, prob) + } + + object PredictData { + def apply(p: Predict): PredictData = PredictData(p.predict, p.prob) + + def apply(r: Row): PredictData = PredictData(r.getDouble(0), r.getDouble(1)) + } + + case class SplitData( + feature: Int, + threshold: Double, + featureType: Int, + categories: Seq[Double]) { // TODO: Change to List once SPARK-3365 is fixed + def toSplit: Split = { + new Split(feature, threshold, FeatureType(featureType), categories.toList) + } + } + + object SplitData { + def apply(s: Split): SplitData = { + SplitData(s.feature, s.threshold, s.featureType.id, s.categories) + } + + def apply(r: Row): SplitData = { + SplitData(r.getInt(0), r.getDouble(1), r.getInt(2), r.getAs[Seq[Double]](3)) + } + } + + /** Model data for model import/export */ + case class NodeData( + treeId: Int, + nodeId: Int, + predict: PredictData, + impurity: Double, + isLeaf: Boolean, + split: Option[SplitData], + leftNodeId: Option[Int], + rightNodeId: Option[Int], + infoGain: Option[Double]) + + object NodeData { + def apply(treeId: Int, n: Node): NodeData = { + NodeData(treeId, n.id, PredictData(n.predict), n.impurity, n.isLeaf, + n.split.map(SplitData.apply), n.leftNode.map(_.id), n.rightNode.map(_.id), + n.stats.map(_.gain)) + } + + def apply(r: Row): NodeData = { + val split = if (r.isNullAt(5)) None else Some(SplitData(r.getStruct(5))) + val leftNodeId = if (r.isNullAt(6)) None else Some(r.getInt(6)) + val rightNodeId = if (r.isNullAt(7)) None else Some(r.getInt(7)) + val infoGain = if (r.isNullAt(8)) None else Some(r.getDouble(8)) + NodeData(r.getInt(0), r.getInt(1), PredictData(r.getStruct(2)), r.getDouble(3), + r.getBoolean(4), split, leftNodeId, rightNodeId, infoGain) + } + } + + def save(sc: SparkContext, path: String, model: DecisionTreeModel): Unit = { + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + + // Create JSON metadata. + val metadataRDD = sc.parallelize( + Seq((thisClassName, thisFormatVersion, model.algo.toString, model.numNodes)), 1) + .toDataFrame("class", "version", "algo", "numNodes") + metadataRDD.toJSON.saveAsTextFile(Loader.metadataPath(path)) + + // Create Parquet data. + val nodes = model.topNode.subtreeIterator.toSeq + val dataRDD: DataFrame = sc.parallelize(nodes) + .map(NodeData.apply(0, _)) + .toDataFrame + dataRDD.saveAsParquetFile(Loader.dataPath(path)) + } + + def load(sc: SparkContext, path: String, algo: String, numNodes: Int): DecisionTreeModel = { + val datapath = Loader.dataPath(path) + val sqlContext = new SQLContext(sc) + // Load Parquet data. + val dataRDD = sqlContext.parquetFile(datapath) + // Check schema explicitly since erasure makes it hard to use match-case for checking. + Loader.checkSchema[NodeData](dataRDD.schema) + val nodes = dataRDD.map(NodeData.apply) + // Build node data into a tree. 
+ val trees = constructTrees(nodes) + assert(trees.size == 1, + "Decision tree should contain exactly one tree but got ${trees.size} trees.") + val model = new DecisionTreeModel(trees(0), Algo.fromString(algo)) + assert(model.numNodes == numNodes, s"Unable to load DecisionTreeModel data from: $datapath." + + s" Expected $numNodes nodes but found ${model.numNodes}") + model + } + + def constructTrees(nodes: RDD[NodeData]): Array[Node] = { + val trees = nodes + .groupBy(_.treeId) + .mapValues(_.toArray) + .collect() + .map { case (treeId, data) => + (treeId, constructTree(data)) + }.sortBy(_._1) + val numTrees = trees.size + val treeIndices = trees.map(_._1).toSeq + assert(treeIndices == (0 until numTrees), + s"Tree indices must start from 0 and increment by 1, but we found $treeIndices.") + trees.map(_._2) + } + + /** + * Given a list of nodes from a tree, construct the tree. + * @param data array of all node data in a tree. + */ + def constructTree(data: Array[NodeData]): Node = { + val dataMap: Map[Int, NodeData] = data.map(n => n.nodeId -> n).toMap + assert(dataMap.contains(1), + s"DecisionTree missing root node (id = 1).") + constructNode(1, dataMap, mutable.Map.empty) + } + + /** + * Builds a node from the node data map and adds new nodes to the input nodes map. + */ + private def constructNode( + id: Int, + dataMap: Map[Int, NodeData], + nodes: mutable.Map[Int, Node]): Node = { + if (nodes.contains(id)) { + return nodes(id) + } + val data = dataMap(id) + val node = + if (data.isLeaf) { + Node(data.nodeId, data.predict.toPredict, data.impurity, data.isLeaf) + } else { + val leftNode = constructNode(data.leftNodeId.get, dataMap, nodes) + val rightNode = constructNode(data.rightNodeId.get, dataMap, nodes) + val stats = new InformationGainStats(data.infoGain.get, data.impurity, leftNode.impurity, + rightNode.impurity, leftNode.predict, rightNode.predict) + new Node(data.nodeId, data.predict.toPredict, data.impurity, data.isLeaf, + data.split.map(_.toSplit), Some(leftNode), Some(rightNode), Some(stats)) + } + nodes += node.id -> node + node + } + } + + override def load(sc: SparkContext, path: String): DecisionTreeModel = { + val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) + val (algo: String, numNodes: Int) = try { + val algo_numNodes = metadata.select("algo", "numNodes").collect() + assert(algo_numNodes.length == 1) + algo_numNodes(0) match { + case Row(a: String, n: Int) => (a, n) + } + } catch { + // Catch both Error and Exception since the checks above can throw either. + case e: Throwable => + throw new Exception( + s"Unable to load DecisionTreeModel metadata from: ${Loader.metadataPath(path)}." + + s" Error message: ${e.getMessage}") + } + val classNameV1_0 = SaveLoadV1_0.thisClassName + (loadedClassName, version) match { + case (className, "1.0") if className == classNameV1_0 => + SaveLoadV1_0.load(sc, path, algo, numNodes) + case _ => throw new Exception( + s"DecisionTreeModel.load did not recognize model with (className, format version):" + + s"($loadedClassName, $version). 
Supported:\n" + + s" ($classNameV1_0, 1.0)") + } + } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala index 9a50ecb550c38..80990aa9a603f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala @@ -49,7 +49,9 @@ class InformationGainStats( gain == other.gain && impurity == other.impurity && leftImpurity == other.leftImpurity && - rightImpurity == other.rightImpurity + rightImpurity == other.rightImpurity && + leftPredict == other.leftPredict && + rightPredict == other.rightPredict } case _ => false } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala index 2179da8dbe03e..d961081d185e9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala @@ -166,6 +166,11 @@ class Node ( } } + /** Returns an iterator that traverses (DFS, left to right) the subtree of this node. */ + private[tree] def subtreeIterator: Iterator[Node] = { + Iterator.single(this) ++ leftNode.map(_.subtreeIterator).getOrElse(Iterator.empty) ++ + rightNode.map(_.subtreeIterator).getOrElse(Iterator.empty) + } } private[tree] object Node { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala index 004838ee5ba0e..ad4c0dbbfb3e5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala @@ -32,4 +32,11 @@ class Predict( override def toString = { "predict = %f, prob = %f".format(predict, prob) } + + override def equals(other: Any): Boolean = { + other match { + case p: Predict => predict == p.predict && prob == p.prob + case _ => false + } + } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index 22997110de8dd..23bd46baabf65 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -21,12 +21,17 @@ import scala.collection.mutable import com.github.fommil.netlib.BLAS.{getInstance => blas} +import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.tree.configuration.Algo._ +import org.apache.spark.mllib.tree.configuration.Algo import org.apache.spark.mllib.tree.configuration.EnsembleCombiningStrategy._ +import org.apache.spark.mllib.util.{Saveable, Loader} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, SQLContext} + /** * :: Experimental :: @@ -38,9 +43,42 @@ import org.apache.spark.rdd.RDD @Experimental class RandomForestModel(override val algo: Algo, override val trees: Array[DecisionTreeModel]) extends TreeEnsembleModel(algo, trees, Array.fill(trees.size)(1.0), - combiningStrategy = if (algo == Classification) Vote else Average) { + combiningStrategy = if (algo == Classification) Vote else Average) + with Saveable { require(trees.forall(_.algo == algo)) + + override def save(sc: 
SparkContext, path: String): Unit = { + TreeEnsembleModel.SaveLoadV1_0.save(sc, path, this, + RandomForestModel.SaveLoadV1_0.thisClassName) + } + + override protected def formatVersion: String = TreeEnsembleModel.SaveLoadV1_0.thisFormatVersion +} + +object RandomForestModel extends Loader[RandomForestModel] { + + override def load(sc: SparkContext, path: String): RandomForestModel = { + val (loadedClassName, version, metadataRDD) = Loader.loadMetadata(sc, path) + val classNameV1_0 = SaveLoadV1_0.thisClassName + (loadedClassName, version) match { + case (className, "1.0") if className == classNameV1_0 => + val metadata = TreeEnsembleModel.SaveLoadV1_0.readMetadata(metadataRDD, path) + assert(metadata.treeWeights.forall(_ == 1.0)) + val trees = + TreeEnsembleModel.SaveLoadV1_0.loadTrees(sc, path, metadata.treeAlgo) + new RandomForestModel(Algo.fromString(metadata.algo), trees) + case _ => throw new Exception(s"RandomForestModel.load did not recognize model" + + s" with (className, format version): ($loadedClassName, $version). Supported:\n" + + s" ($classNameV1_0, 1.0)") + } + } + + private object SaveLoadV1_0 { + // Hard-code class name string in case it changes in the future + def thisClassName = "org.apache.spark.mllib.tree.model.RandomForestModel" + } + } /** @@ -56,9 +94,42 @@ class GradientBoostedTreesModel( override val algo: Algo, override val trees: Array[DecisionTreeModel], override val treeWeights: Array[Double]) - extends TreeEnsembleModel(algo, trees, treeWeights, combiningStrategy = Sum) { + extends TreeEnsembleModel(algo, trees, treeWeights, combiningStrategy = Sum) + with Saveable { require(trees.size == treeWeights.size) + + override def save(sc: SparkContext, path: String): Unit = { + TreeEnsembleModel.SaveLoadV1_0.save(sc, path, this, + GradientBoostedTreesModel.SaveLoadV1_0.thisClassName) + } + + override protected def formatVersion: String = TreeEnsembleModel.SaveLoadV1_0.thisFormatVersion +} + +object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { + + override def load(sc: SparkContext, path: String): GradientBoostedTreesModel = { + val (loadedClassName, version, metadataRDD) = Loader.loadMetadata(sc, path) + val classNameV1_0 = SaveLoadV1_0.thisClassName + (loadedClassName, version) match { + case (className, "1.0") if className == classNameV1_0 => + val metadata = TreeEnsembleModel.SaveLoadV1_0.readMetadata(metadataRDD, path) + assert(metadata.combiningStrategy == Sum.toString) + val trees = + TreeEnsembleModel.SaveLoadV1_0.loadTrees(sc, path, metadata.treeAlgo) + new GradientBoostedTreesModel(Algo.fromString(metadata.algo), trees, metadata.treeWeights) + case _ => throw new Exception(s"GradientBoostedTreesModel.load did not recognize model" + + s" with (className, format version): ($loadedClassName, $version). Supported:\n" + + s" ($classNameV1_0, 1.0)") + } + } + + private object SaveLoadV1_0 { + // Hard-code class name string in case it changes in the future + def thisClassName = "org.apache.spark.mllib.tree.model.GradientBoostedTreesModel" + } + } /** @@ -176,3 +247,85 @@ private[tree] sealed class TreeEnsembleModel( */ def totalNumNodes: Int = trees.map(_.numNodes).sum } + +private[tree] object TreeEnsembleModel { + + object SaveLoadV1_0 { + + import DecisionTreeModel.SaveLoadV1_0.{NodeData, constructTrees} + + def thisFormatVersion = "1.0" + + case class Metadata( + algo: String, + treeAlgo: String, + combiningStrategy: String, + treeWeights: Array[Double]) + + /** + * Model data for model import/export. 
+ * We have to duplicate NodeData here since Spark SQL does not yet support extracting subfields + * of nested fields; once that is possible, we can use something like: + * case class EnsembleNodeData(treeId: Int, node: NodeData), + * where NodeData is from DecisionTreeModel. + */ + case class EnsembleNodeData(treeId: Int, node: NodeData) + + def save(sc: SparkContext, path: String, model: TreeEnsembleModel, className: String): Unit = { + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + + // Create JSON metadata. + val metadata = Metadata(model.algo.toString, model.trees(0).algo.toString, + model.combiningStrategy.toString, model.treeWeights) + val metadataRDD = sc.parallelize(Seq((className, thisFormatVersion, metadata)), 1) + .toDataFrame("class", "version", "metadata") + metadataRDD.toJSON.saveAsTextFile(Loader.metadataPath(path)) + + // Create Parquet data. + val dataRDD = sc.parallelize(model.trees.zipWithIndex).flatMap { case (tree, treeId) => + tree.topNode.subtreeIterator.toSeq.map(node => NodeData(treeId, node)) + }.toDataFrame + dataRDD.saveAsParquetFile(Loader.dataPath(path)) + } + + /** + * Read metadata from the loaded metadata DataFrame. + * @param path Path for loading data, used for debug messages. + */ + def readMetadata(metadata: DataFrame, path: String): Metadata = { + try { + // We rely on the try-catch for schema checking rather than creating a schema just for this. + val metadataArray = metadata.select("metadata.algo", "metadata.treeAlgo", + "metadata.combiningStrategy", "metadata.treeWeights").collect() + assert(metadataArray.size == 1) + Metadata(metadataArray(0).getString(0), metadataArray(0).getString(1), + metadataArray(0).getString(2), metadataArray(0).getAs[Seq[Double]](3).toArray) + } catch { + // Catch both Error and Exception since the checks above can throw either. + case e: Throwable => + throw new Exception( + s"Unable to load TreeEnsembleModel metadata from: ${Loader.metadataPath(path)}." + + s" Error message: ${e.getMessage}") + } + } + + /** + * Load trees for an ensemble, and return them in order. + * @param path path to load the model from + * @param treeAlgo Algorithm for individual trees (which may differ from the ensemble's + * algorithm). 
+ */ + def loadTrees( + sc: SparkContext, + path: String, + treeAlgo: String): Array[DecisionTreeModel] = { + val datapath = Loader.dataPath(path) + val sqlContext = new SQLContext(sc) + val nodes = sqlContext.parquetFile(datapath).map(NodeData.apply) + val trees = constructTrees(nodes) + trees.map(new DecisionTreeModel(_, Algo.fromString(treeAlgo))) + } + } + +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala index 9347eaf9221a8..7b1aed5ffeb3e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala @@ -29,8 +29,10 @@ import org.apache.spark.mllib.tree.configuration.FeatureType._ import org.apache.spark.mllib.tree.configuration.{QuantileStrategy, Strategy} import org.apache.spark.mllib.tree.impl.{BaggedPoint, DecisionTreeMetadata, TreePoint} import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Variance} -import org.apache.spark.mllib.tree.model.{InformationGainStats, DecisionTreeModel, Node} +import org.apache.spark.mllib.tree.model._ import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.util.Utils + class DecisionTreeSuite extends FunSuite with MLlibTestSparkContext { @@ -857,9 +859,32 @@ class DecisionTreeSuite extends FunSuite with MLlibTestSparkContext { assert(topNode.leftNode.get.impurity === 0.0) assert(topNode.rightNode.get.impurity === 0.0) } + + test("Node.subtreeIterator") { + val model = DecisionTreeSuite.createModel(Classification) + val nodeIds = model.topNode.subtreeIterator.map(_.id).toArray.sorted + assert(nodeIds === DecisionTreeSuite.createdModelNodeIds) + } + + test("model save/load") { + val tempDir = Utils.createTempDir() + val path = tempDir.toURI.toString + + Array(Classification, Regression).foreach { algo => + val model = DecisionTreeSuite.createModel(algo) + // Save model, load it back, and compare. + try { + model.save(sc, path) + val sameModel = DecisionTreeModel.load(sc, path) + DecisionTreeSuite.checkEqual(model, sameModel) + } finally { + Utils.deleteRecursively(tempDir) + } + } + } } -object DecisionTreeSuite { +object DecisionTreeSuite extends FunSuite { def validateClassifier( model: DecisionTreeModel, @@ -979,4 +1004,95 @@ object DecisionTreeSuite { arr } + /** Create a leaf node with the given node ID */ + private def createLeafNode(id: Int): Node = { + Node(nodeIndex = id, new Predict(0.0, 1.0), impurity = 0.5, isLeaf = true) + } + + /** + * Create an internal node with the given node ID and feature type. + * Note: This does NOT set the child nodes. + */ + private def createInternalNode(id: Int, featureType: FeatureType): Node = { + val node = Node(nodeIndex = id, new Predict(0.0, 1.0), impurity = 0.5, isLeaf = false) + featureType match { + case Continuous => + node.split = Some(new Split(feature = 0, threshold = 0.5, Continuous, + categories = List.empty[Double])) + case Categorical => + node.split = Some(new Split(feature = 1, threshold = 0.0, Categorical, + categories = List(0.0, 1.0))) + } + // TODO: The information gain stats should be consistent with the same info stored in children. + node.stats = Some(new InformationGainStats(gain = 0.1, impurity = 0.2, + leftImpurity = 0.3, rightImpurity = 0.4, new Predict(1.0, 0.4), new Predict(0.0, 0.6))) + node + } + + /** + * Create a tree model. This is deterministic and contains a variety of node and feature types. 
+ */ + private[tree] def createModel(algo: Algo): DecisionTreeModel = { + val topNode = createInternalNode(id = 1, Continuous) + val (node2, node3) = (createLeafNode(id = 2), createInternalNode(id = 3, Categorical)) + val (node6, node7) = (createLeafNode(id = 6), createLeafNode(id = 7)) + topNode.leftNode = Some(node2) + topNode.rightNode = Some(node3) + node3.leftNode = Some(node6) + node3.rightNode = Some(node7) + new DecisionTreeModel(topNode, algo) + } + + /** Sorted Node IDs matching the model returned by [[createModel()]] */ + private val createdModelNodeIds = Array(1, 2, 3, 6, 7) + + /** + * Check if the two trees are exactly the same. + * Note: I hesitate to override Node.equals since it could cause problems if users + * make mistakes such as creating loops of Nodes. + * If the trees are not equal, this prints the two trees and throws an exception. + */ + private[tree] def checkEqual(a: DecisionTreeModel, b: DecisionTreeModel): Unit = { + try { + assert(a.algo === b.algo) + checkEqual(a.topNode, b.topNode) + } catch { + case ex: Exception => + throw new AssertionError("checkEqual failed since the two trees were not identical.\n" + + "TREE A:\n" + a.toDebugString + "\n" + + "TREE B:\n" + b.toDebugString + "\n", ex) + } + } + + /** + * Return true iff the two nodes and their descendents are exactly the same. + * Note: I hesitate to override Node.equals since it could cause problems if users + * make mistakes such as creating loops of Nodes. + */ + private def checkEqual(a: Node, b: Node): Unit = { + assert(a.id === b.id) + assert(a.predict === b.predict) + assert(a.impurity === b.impurity) + assert(a.isLeaf === b.isLeaf) + assert(a.split === b.split) + (a.stats, b.stats) match { + // TODO: Check other fields besides the infomation gain. + case (Some(aStats), Some(bStats)) => assert(aStats.gain === bStats.gain) + case (None, None) => + case _ => throw new AssertionError( + s"Only one instance has stats defined. (a.stats: ${a.stats}, b.stats: ${b.stats})") + } + (a.leftNode, b.leftNode) match { + case (Some(aNode), Some(bNode)) => checkEqual(aNode, bNode) + case (None, None) => + case _ => throw new AssertionError("Only one instance has leftNode defined. " + + s"(a.leftNode: ${a.leftNode}, b.leftNode: ${b.leftNode})") + } + (a.rightNode, b.rightNode) match { + case (Some(aNode: Node), Some(bNode: Node)) => checkEqual(aNode, bNode) + case (None, None) => + case _ => throw new AssertionError("Only one instance has rightNode defined. " + + s"(a.rightNode: ${a.rightNode}, b.rightNode: ${b.rightNode})") + } + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala index e8341a5d0d104..bde47606eb001 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala @@ -24,8 +24,10 @@ import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy} import org.apache.spark.mllib.tree.impurity.Variance import org.apache.spark.mllib.tree.loss.{AbsoluteError, SquaredError, LogLoss} - +import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.util.Utils + /** * Test suite for [[GradientBoostedTrees]]. 
@@ -35,32 +37,30 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext { test("Regression with continuous features: SquaredError") { GradientBoostedTreesSuite.testCombinations.foreach { case (numIterations, learningRate, subsamplingRate) => - GradientBoostedTreesSuite.randomSeeds.foreach { randomSeed => - val rdd = sc.parallelize(GradientBoostedTreesSuite.data, 2) - - val treeStrategy = new Strategy(algo = Regression, impurity = Variance, maxDepth = 2, - categoricalFeaturesInfo = Map.empty, subsamplingRate = subsamplingRate) - val boostingStrategy = - new BoostingStrategy(treeStrategy, SquaredError, numIterations, learningRate) - - val gbt = GradientBoostedTrees.train(rdd, boostingStrategy) - - assert(gbt.trees.size === numIterations) - try { - EnsembleTestHelper.validateRegressor(gbt, GradientBoostedTreesSuite.data, 0.06) - } catch { - case e: java.lang.AssertionError => - println(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," + - s" subsamplingRate=$subsamplingRate") - throw e - } - - val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) - val dt = DecisionTree.train(remappedInput, treeStrategy) - - // Make sure trees are the same. - assert(gbt.trees.head.toString == dt.toString) + val rdd = sc.parallelize(GradientBoostedTreesSuite.data, 2) + + val treeStrategy = new Strategy(algo = Regression, impurity = Variance, maxDepth = 2, + categoricalFeaturesInfo = Map.empty, subsamplingRate = subsamplingRate) + val boostingStrategy = + new BoostingStrategy(treeStrategy, SquaredError, numIterations, learningRate) + + val gbt = GradientBoostedTrees.train(rdd, boostingStrategy) + + assert(gbt.trees.size === numIterations) + try { + EnsembleTestHelper.validateRegressor(gbt, GradientBoostedTreesSuite.data, 0.06) + } catch { + case e: java.lang.AssertionError => + println(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," + + s" subsamplingRate=$subsamplingRate") + throw e } + + val remappedInput = rdd.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) + val dt = DecisionTree.train(remappedInput, treeStrategy) + + // Make sure trees are the same. + assert(gbt.trees.head.toString == dt.toString) } } @@ -133,14 +133,37 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext { BoostingStrategy.defaultParams(algo) } } + + test("model save/load") { + val tempDir = Utils.createTempDir() + val path = tempDir.toURI.toString + + val trees = Range(0, 3).map(_ => DecisionTreeSuite.createModel(Regression)).toArray + val treeWeights = Array(0.1, 0.3, 1.1) + + Array(Classification, Regression).foreach { algo => + val model = new GradientBoostedTreesModel(algo, trees, treeWeights) + + // Save model, load it back, and compare. 
+ try { + model.save(sc, path) + val sameModel = GradientBoostedTreesModel.load(sc, path) + assert(model.algo == sameModel.algo) + model.trees.zip(sameModel.trees).foreach { case (treeA, treeB) => + DecisionTreeSuite.checkEqual(treeA, treeB) + } + assert(model.treeWeights === sameModel.treeWeights) + } finally { + Utils.deleteRecursively(tempDir) + } + } + } } -object GradientBoostedTreesSuite { +private object GradientBoostedTreesSuite { // Combinations for estimators, learning rates and subsamplingRate val testCombinations = Array((10, 1.0, 1.0), (10, 0.1, 1.0), (10, 0.5, 0.75), (10, 0.1, 0.75)) - val randomSeeds = Array(681283, 4398) - val data = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 10, 100) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala index 55e963977b54f..ee3bc98486862 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/RandomForestSuite.scala @@ -27,8 +27,10 @@ import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.configuration.Strategy import org.apache.spark.mllib.tree.impl.DecisionTreeMetadata import org.apache.spark.mllib.tree.impurity.{Gini, Variance} -import org.apache.spark.mllib.tree.model.Node +import org.apache.spark.mllib.tree.model.{Node, RandomForestModel} import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.util.Utils + /** * Test suite for [[RandomForest]]. @@ -212,6 +214,26 @@ class RandomForestSuite extends FunSuite with MLlibTestSparkContext { assert(rf1.toDebugString != rf2.toDebugString) } -} - + test("model save/load") { + val tempDir = Utils.createTempDir() + val path = tempDir.toURI.toString + + Array(Classification, Regression).foreach { algo => + val trees = Range(0, 3).map(_ => DecisionTreeSuite.createModel(algo)).toArray + val model = new RandomForestModel(algo, trees) + + // Save model, load it back, and compare. + try { + model.save(sc, path) + val sameModel = RandomForestModel.load(sc, path) + assert(model.algo == sameModel.algo) + model.trees.zip(sameModel.trees).foreach { case (treeA, treeB) => + DecisionTreeSuite.checkEqual(treeA, treeB) + } + } finally { + Utils.deleteRecursively(tempDir) + } + } + } +} From c15134632e74e3dee05eda20c6ef79915e15d02e Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Mon, 9 Feb 2015 22:45:48 -0800 Subject: [PATCH 088/357] [SPARK-4964][Streaming][Kafka] More updates to Exactly-once Kafka stream Changes - Added example - Added a critical unit test that verifies that offset ranges can be recovered through checkpoints Might add more changes. Author: Tathagata Das Closes #4384 from tdas/new-kafka-fixes and squashes the following commits: 7c931c3 [Tathagata Das] Small update 3ed9284 [Tathagata Das] updated scala doc 83d0402 [Tathagata Das] Added JavaDirectKafkaWordCount example. 26df23c [Tathagata Das] Updates based on PR comments from Cody e4abf69 [Tathagata Das] Scala doc improvements and stuff. bb65232 [Tathagata Das] Fixed test bug and refactored KafkaStreamSuite 50f2b56 [Tathagata Das] Added Java API and added more Scala and Java unit tests. Also updated docs. e73589c [Tathagata Das] Minor changes. 
4986784 [Tathagata Das] Added unit test to kafka offset recovery 6a91cab [Tathagata Das] Added example --- .../streaming/JavaDirectKafkaWordCount.java | 113 ++++++ .../streaming/DirectKafkaWordCount.scala | 71 ++++ .../kafka/DirectKafkaInputDStream.scala | 5 +- .../spark/streaming/kafka/KafkaCluster.scala | 3 + .../spark/streaming/kafka/KafkaRDD.scala | 12 +- .../streaming/kafka/KafkaRDDPartition.scala | 23 +- .../spark/streaming/kafka/KafkaUtils.scala | 353 ++++++++++++++---- .../apache/spark/streaming/kafka/Leader.scala | 21 +- .../spark/streaming/kafka/OffsetRange.scala | 53 ++- .../kafka/JavaDirectKafkaStreamSuite.java | 159 ++++++++ .../streaming/kafka/JavaKafkaStreamSuite.java | 5 +- .../kafka/DirectKafkaStreamSuite.scala | 302 +++++++++++++++ .../streaming/kafka/KafkaClusterSuite.scala | 24 +- .../kafka/KafkaDirectStreamSuite.scala | 92 ----- .../spark/streaming/kafka/KafkaRDDSuite.scala | 8 +- .../streaming/kafka/KafkaStreamSuite.scala | 62 +-- .../kafka/ReliableKafkaStreamSuite.scala | 4 +- 17 files changed, 1048 insertions(+), 262 deletions(-) create mode 100644 examples/scala-2.10/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java create mode 100644 examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala create mode 100644 external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java create mode 100644 external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala delete mode 100644 external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaDirectStreamSuite.scala diff --git a/examples/scala-2.10/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java b/examples/scala-2.10/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java new file mode 100644 index 0000000000000..bab9f2478e779 --- /dev/null +++ b/examples/scala-2.10/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.streaming; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Arrays; +import java.util.regex.Pattern; + +import scala.Tuple2; + +import com.google.common.collect.Lists; +import kafka.serializer.StringDecoder; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.*; +import org.apache.spark.streaming.api.java.*; +import org.apache.spark.streaming.kafka.KafkaUtils; +import org.apache.spark.streaming.Durations; + +/** + * Consumes messages from one or more topics in Kafka and does wordcount. 
+ * Usage: DirectKafkaWordCount + * is a list of one or more Kafka brokers + * is a list of one or more kafka topics to consume from + * + * Example: + * $ bin/run-example streaming.KafkaWordCount broker1-host:port,broker2-host:port topic1,topic2 + */ + +public final class JavaDirectKafkaWordCount { + private static final Pattern SPACE = Pattern.compile(" "); + + public static void main(String[] args) { + if (args.length < 2) { + System.err.println("Usage: DirectKafkaWordCount \n" + + " is a list of one or more Kafka brokers\n" + + " is a list of one or more kafka topics to consume from\n\n"); + System.exit(1); + } + + StreamingExamples.setStreamingLogLevels(); + + String brokers = args[0]; + String topics = args[1]; + + // Create context with 2 second batch interval + SparkConf sparkConf = new SparkConf().setAppName("JavaDirectKafkaWordCount"); + JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2)); + + HashSet topicsSet = new HashSet(Arrays.asList(topics.split(","))); + HashMap kafkaParams = new HashMap(); + kafkaParams.put("metadata.broker.list", brokers); + + // Create direct kafka stream with brokers and topics + JavaPairInputDStream messages = KafkaUtils.createDirectStream( + jssc, + String.class, + String.class, + StringDecoder.class, + StringDecoder.class, + kafkaParams, + topicsSet + ); + + // Get the lines, split them into words, count the words and print + JavaDStream lines = messages.map(new Function, String>() { + @Override + public String call(Tuple2 tuple2) { + return tuple2._2(); + } + }); + JavaDStream words = lines.flatMap(new FlatMapFunction() { + @Override + public Iterable call(String x) { + return Lists.newArrayList(SPACE.split(x)); + } + }); + JavaPairDStream wordCounts = words.mapToPair( + new PairFunction() { + @Override + public Tuple2 call(String s) { + return new Tuple2(s, 1); + } + }).reduceByKey( + new Function2() { + @Override + public Integer call(Integer i1, Integer i2) { + return i1 + i2; + } + }); + wordCounts.print(); + + // Start the computation + jssc.start(); + jssc.awaitTermination(); + } +} diff --git a/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala new file mode 100644 index 0000000000000..deb08fd57b8c7 --- /dev/null +++ b/examples/scala-2.10/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.streaming + +import kafka.serializer.StringDecoder + +import org.apache.spark.streaming._ +import org.apache.spark.streaming.kafka._ +import org.apache.spark.SparkConf + +/** + * Consumes messages from one or more topics in Kafka and does wordcount. + * Usage: DirectKafkaWordCount + * is a list of one or more Kafka brokers + * is a list of one or more kafka topics to consume from + * + * Example: + * $ bin/run-example streaming.KafkaWordCount broker1-host:port,broker2-host:port topic1,topic2 + */ +object DirectKafkaWordCount { + def main(args: Array[String]) { + if (args.length < 2) { + System.err.println(s""" + |Usage: DirectKafkaWordCount + | is a list of one or more Kafka brokers + | is a list of one or more kafka topics to consume from + | + """".stripMargin) + System.exit(1) + } + + StreamingExamples.setStreamingLogLevels() + + val Array(brokers, topics) = args + + // Create context with 2 second batch interval + val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount") + val ssc = new StreamingContext(sparkConf, Seconds(2)) + + // Create direct kafka stream with brokers and topics + val topicsSet = topics.split(",").toSet + val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers) + val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( + ssc, kafkaParams, topicsSet) + + // Get the lines, split them into words, count the words and print + val lines = messages.map(_._2) + val words = lines.flatMap(_.split(" ")) + val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) + wordCounts.print() + + // Start the computation + ssc.start() + ssc.awaitTermination() + } +} diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index c7bca43eb889d..04e65cb3d708c 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -50,14 +50,13 @@ import org.apache.spark.streaming.dstream._ * @param fromOffsets per-topic/partition Kafka offsets defining the (inclusive) * starting point of the stream * @param messageHandler function for translating each message into the desired type - * @param maxRetries maximum number of times in a row to retry getting leaders' offsets */ private[streaming] class DirectKafkaInputDStream[ K: ClassTag, V: ClassTag, - U <: Decoder[_]: ClassTag, - T <: Decoder[_]: ClassTag, + U <: Decoder[K]: ClassTag, + T <: Decoder[V]: ClassTag, R: ClassTag]( @transient ssc_ : StreamingContext, val kafkaParams: Map[String, String], diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala index ccc62bfe8f057..2f7e0ab39fefd 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala @@ -332,6 +332,9 @@ object KafkaCluster { extends ConsumerConfig(originalProps) { val seedBrokers: Array[(String, Int)] = brokers.split(",").map { hp => val hpa = hp.split(":") + if (hpa.size == 1) { + throw new SparkException(s"Broker not the in correct format of : [$brokers]") + } (hpa(0), hpa(1).toInt) } } diff --git 
a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala index 50bf7cbdb8dbf..d56cc01be9514 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala @@ -36,14 +36,12 @@ import kafka.utils.VerifiableProperties * Starting and ending offsets are specified in advance, * so that you can control exactly-once semantics. * @param kafkaParams Kafka - * configuration parameters. - * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s), - * NOT zookeeper servers, specified in host1:port1,host2:port2 form. - * @param batch Each KafkaRDDPartition in the batch corresponds to a - * range of offsets for a given Kafka topic/partition + * configuration parameters. Requires "metadata.broker.list" or "bootstrap.servers" to be set + * with Kafka broker(s) specified in host1:port1,host2:port2 form. + * @param offsetRanges offset ranges that define the Kafka data belonging to this RDD * @param messageHandler function for translating each message into the desired type */ -private[spark] +private[kafka] class KafkaRDD[ K: ClassTag, V: ClassTag, @@ -183,7 +181,7 @@ class KafkaRDD[ } } -private[spark] +private[kafka] object KafkaRDD { import KafkaCluster.LeaderOffset diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDDPartition.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDDPartition.scala index 36372e08f65f6..a842a6f17766f 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDDPartition.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDDPartition.scala @@ -26,7 +26,7 @@ import org.apache.spark.Partition * @param host preferred kafka host, i.e. 
the leader at the time the rdd was created * @param port preferred kafka host's port */ -private[spark] +private[kafka] class KafkaRDDPartition( val index: Int, val topic: String, @@ -36,24 +36,3 @@ class KafkaRDDPartition( val host: String, val port: Int ) extends Partition - -private[spark] -object KafkaRDDPartition { - def apply( - index: Int, - topic: String, - partition: Int, - fromOffset: Long, - untilOffset: Long, - host: String, - port: Int - ): KafkaRDDPartition = new KafkaRDDPartition( - index, - topic, - partition, - fromOffset, - untilOffset, - host, - port - ) -} diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index f8aa6c5c6263c..7a2c3abdcc24b 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -18,7 +18,9 @@ package org.apache.spark.streaming.kafka import java.lang.{Integer => JInt} +import java.lang.{Long => JLong} import java.util.{Map => JMap} +import java.util.{Set => JSet} import scala.reflect.ClassTag import scala.collection.JavaConversions._ @@ -27,18 +29,19 @@ import kafka.common.TopicAndPartition import kafka.message.MessageAndMetadata import kafka.serializer.{Decoder, StringDecoder} - +import org.apache.spark.api.java.function.{Function => JFunction} import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.annotation.Experimental import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming.api.java.{JavaPairReceiverInputDStream, JavaStreamingContext} +import org.apache.spark.streaming.api.java.{JavaPairInputDStream, JavaInputDStream, JavaPairReceiverInputDStream, JavaStreamingContext} import org.apache.spark.streaming.dstream.{InputDStream, ReceiverInputDStream} +import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD} object KafkaUtils { /** - * Create an input stream that pulls messages from a Kafka Broker. + * Create an input stream that pulls messages from Kafka Brokers. * @param ssc StreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..) * @param groupId The group id for this consumer @@ -62,7 +65,7 @@ object KafkaUtils { } /** - * Create an input stream that pulls messages from a Kafka Broker. + * Create an input stream that pulls messages from Kafka Brokers. * @param ssc StreamingContext object * @param kafkaParams Map of kafka configuration parameters, * see http://kafka.apache.org/08/configuration.html @@ -81,7 +84,7 @@ object KafkaUtils { } /** - * Create an input stream that pulls messages from a Kafka Broker. + * Create an input stream that pulls messages from Kafka Brokers. * Storage level of the data will be the default StorageLevel.MEMORY_AND_DISK_SER_2. * @param jssc JavaStreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..) @@ -99,7 +102,7 @@ object KafkaUtils { } /** - * Create an input stream that pulls messages from a Kafka Broker. + * Create an input stream that pulls messages from Kafka Brokers. * @param jssc JavaStreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..). * @param groupId The group id for this consumer. @@ -119,10 +122,10 @@ object KafkaUtils { } /** - * Create an input stream that pulls messages from a Kafka Broker. 
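For contrast with the direct API added in this patch, the receiver-based createStream whose docs are being reworded here is typically used as in this hedged Scala sketch; the ZooKeeper quorum, group id, and topic map are placeholders.

    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.kafka.KafkaUtils

    val sparkConf = new SparkConf().setAppName("ReceiverBasedKafka")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    // High-level consumer: offsets are tracked in ZooKeeper, one receiver thread per topic.
    val lines = KafkaUtils.createStream(ssc, "zk1:2181,zk2:2181", "my-consumer-group",
      Map("topic1" -> 1, "topic2" -> 1)).map(_._2)
    lines.count().print()
    ssc.start()
    ssc.awaitTermination()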
+ * Create an input stream that pulls messages from Kafka Brokers. * @param jssc JavaStreamingContext object - * @param keyTypeClass Key type of RDD - * @param valueTypeClass value type of RDD + * @param keyTypeClass Key type of DStream + * @param valueTypeClass value type of Dstream * @param keyDecoderClass Type of kafka key decoder * @param valueDecoderClass Type of kafka value decoder * @param kafkaParams Map of kafka configuration parameters, @@ -151,14 +154,14 @@ object KafkaUtils { jssc.ssc, kafkaParams.toMap, Map(topics.mapValues(_.intValue()).toSeq: _*), storageLevel) } - /** A batch-oriented interface for consuming from Kafka. - * Starting and ending offsets are specified in advance, - * so that you can control exactly-once semantics. + /** + * Create a RDD from Kafka using offset ranges for each topic and partition. + * * @param sc SparkContext object * @param kafkaParams Kafka - * configuration parameters. - * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s), - * NOT zookeeper servers, specified in host1:port1,host2:port2 form. + * configuration parameters. Requires "metadata.broker.list" or "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers) specified in + * host1:port1,host2:port2 form. * @param offsetRanges Each OffsetRange in the batch corresponds to a * range of offsets for a given Kafka topic/partition */ @@ -166,12 +169,12 @@ object KafkaUtils { def createRDD[ K: ClassTag, V: ClassTag, - U <: Decoder[_]: ClassTag, - T <: Decoder[_]: ClassTag] ( + KD <: Decoder[K]: ClassTag, + VD <: Decoder[V]: ClassTag]( sc: SparkContext, kafkaParams: Map[String, String], offsetRanges: Array[OffsetRange] - ): RDD[(K, V)] = { + ): RDD[(K, V)] = { val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message) val kc = new KafkaCluster(kafkaParams) val topics = offsetRanges.map(o => TopicAndPartition(o.topic, o.partition)).toSet @@ -179,121 +182,196 @@ object KafkaUtils { errs => throw new SparkException(errs.mkString("\n")), ok => ok ) - new KafkaRDD[K, V, U, T, (K, V)](sc, kafkaParams, offsetRanges, leaders, messageHandler) + new KafkaRDD[K, V, KD, VD, (K, V)](sc, kafkaParams, offsetRanges, leaders, messageHandler) } - /** A batch-oriented interface for consuming from Kafka. - * Starting and ending offsets are specified in advance, - * so that you can control exactly-once semantics. + /** + * :: Experimental :: + * Create a RDD from Kafka using offset ranges for each topic and partition. This allows you + * specify the Kafka leader to connect to (to optimize fetching) and access the message as well + * as the metadata. + * * @param sc SparkContext object * @param kafkaParams Kafka - * configuration parameters. - * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s), - * NOT zookeeper servers, specified in host1:port1,host2:port2 form. + * configuration parameters. Requires "metadata.broker.list" or "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers) specified in + * host1:port1,host2:port2 form. 
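A minimal batch-oriented sketch against the createRDD overload just defined; the broker list, topic, and offsets are placeholders, and the String decoders are the same ones used elsewhere in this patch. It assumes a live SparkContext `sc`.

    import kafka.serializer.StringDecoder
    import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}

    val kafkaParams = Map("metadata.broker.list" -> "broker1:9092,broker2:9092")
    // Read exactly offsets [0, 100) of partition 0 of topic "events" as (key, value) pairs.
    val offsetRanges = Array(OffsetRange.create("events", 0, 0L, 100L))
    val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](
      sc, kafkaParams, offsetRanges)
    println(rdd.count())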
* @param offsetRanges Each OffsetRange in the batch corresponds to a * range of offsets for a given Kafka topic/partition * @param leaders Kafka leaders for each offset range in batch - * @param messageHandler function for translating each message into the desired type + * @param messageHandler Function for translating each message and metadata into the desired type */ @Experimental def createRDD[ K: ClassTag, V: ClassTag, - U <: Decoder[_]: ClassTag, - T <: Decoder[_]: ClassTag, - R: ClassTag] ( + KD <: Decoder[K]: ClassTag, + VD <: Decoder[V]: ClassTag, + R: ClassTag]( sc: SparkContext, kafkaParams: Map[String, String], offsetRanges: Array[OffsetRange], leaders: Array[Leader], messageHandler: MessageAndMetadata[K, V] => R - ): RDD[R] = { - + ): RDD[R] = { val leaderMap = leaders .map(l => TopicAndPartition(l.topic, l.partition) -> (l.host, l.port)) .toMap - new KafkaRDD[K, V, U, T, R](sc, kafkaParams, offsetRanges, leaderMap, messageHandler) + new KafkaRDD[K, V, KD, VD, R](sc, kafkaParams, offsetRanges, leaderMap, messageHandler) } + /** - * This stream can guarantee that each message from Kafka is included in transformations - * (as opposed to output actions) exactly once, even in most failure situations. + * Create a RDD from Kafka using offset ranges for each topic and partition. * - * Points to note: - * - * Failure Recovery - You must checkpoint this stream, or save offsets yourself and provide them - * as the fromOffsets parameter on restart. - * Kafka must have sufficient log retention to obtain messages after failure. - * - * Getting offsets from the stream - see programming guide + * @param jsc JavaSparkContext object + * @param kafkaParams Kafka + * configuration parameters. Requires "metadata.broker.list" or "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers) specified in + * host1:port1,host2:port2 form. + * @param offsetRanges Each OffsetRange in the batch corresponds to a + * range of offsets for a given Kafka topic/partition + */ + @Experimental + def createRDD[K, V, KD <: Decoder[K], VD <: Decoder[V]]( + jsc: JavaSparkContext, + keyClass: Class[K], + valueClass: Class[V], + keyDecoderClass: Class[KD], + valueDecoderClass: Class[VD], + kafkaParams: JMap[String, String], + offsetRanges: Array[OffsetRange] + ): JavaPairRDD[K, V] = { + implicit val keyCmt: ClassTag[K] = ClassTag(keyClass) + implicit val valueCmt: ClassTag[V] = ClassTag(valueClass) + implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass) + implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass) + new JavaPairRDD(createRDD[K, V, KD, VD]( + jsc.sc, Map(kafkaParams.toSeq: _*), offsetRanges)) + } + + /** + * :: Experimental :: + * Create a RDD from Kafka using offset ranges for each topic and partition. This allows you + * specify the Kafka leader to connect to (to optimize fetching) and access the message as well + * as the metadata. * -. * Zookeeper - This does not use Zookeeper to store offsets. For interop with Kafka monitors - * that depend on Zookeeper, you must store offsets in ZK yourself. + * @param jsc JavaSparkContext object + * @param kafkaParams Kafka + * configuration parameters. Requires "metadata.broker.list" or "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers) specified in + * host1:port1,host2:port2 form. 
+ * @param offsetRanges Each OffsetRange in the batch corresponds to a + * range of offsets for a given Kafka topic/partition + * @param leaders Kafka leaders for each offset range in batch + * @param messageHandler Function for translating each message and metadata into the desired type + */ + @Experimental + def createRDD[K, V, KD <: Decoder[K], VD <: Decoder[V], R]( + jsc: JavaSparkContext, + keyClass: Class[K], + valueClass: Class[V], + keyDecoderClass: Class[KD], + valueDecoderClass: Class[VD], + recordClass: Class[R], + kafkaParams: JMap[String, String], + offsetRanges: Array[OffsetRange], + leaders: Array[Leader], + messageHandler: JFunction[MessageAndMetadata[K, V], R] + ): JavaRDD[R] = { + implicit val keyCmt: ClassTag[K] = ClassTag(keyClass) + implicit val valueCmt: ClassTag[V] = ClassTag(valueClass) + implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass) + implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass) + implicit val recordCmt: ClassTag[R] = ClassTag(recordClass) + createRDD[K, V, KD, VD, R]( + jsc.sc, Map(kafkaParams.toSeq: _*), offsetRanges, leaders, messageHandler.call _) + } + + /** + * :: Experimental :: + * Create an input stream that directly pulls messages from Kafka Brokers + * without using any receiver. This stream can guarantee that each message + * from Kafka is included in transformations exactly once (see points below). * - * End-to-end semantics - This does not guarantee that any output operation will push each record - * exactly once. To ensure end-to-end exactly-once semantics (that is, receiving exactly once and - * outputting exactly once), you have to either ensure that the output operation is - * idempotent, or transactionally store offsets with the output. See the programming guide for - * more details. + * Points to note: + * - No receivers: This stream does not use any receiver. It directly queries Kafka + * - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked + * by the stream itself. For interoperability with Kafka monitoring tools that depend on + * Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application. + * You can access the offsets used in each batch from the generated RDDs (see + * [[org.apache.spark.streaming.kafka.HasOffsetRanges]]). + * - Failure Recovery: To recover from driver failures, you have to enable checkpointing + * in the [[StreamingContext]]. The information on consumed offset can be + * recovered from the checkpoint. See the programming guide for details (constraints, etc.). + * - End-to-end semantics: This stream ensures that every records is effectively received and + * transformed exactly once, but gives no guarantees on whether the transformed data are + * outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure + * that the output operation is idempotent, or use transactions to output records atomically. + * See the programming guide for more details. * * @param ssc StreamingContext object * @param kafkaParams Kafka - * configuration parameters. - * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s), - * NOT zookeeper servers, specified in host1:port1,host2:port2 form. - * @param messageHandler function for translating each message into the desired type - * @param fromOffsets per-topic/partition Kafka offsets defining the (inclusive) - * starting point of the stream + * configuration parameters. 
Requires "metadata.broker.list" or "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers) specified in + * host1:port1,host2:port2 form. + * @param fromOffsets Per-topic/partition Kafka offsets defining the (inclusive) + * starting point of the stream + * @param messageHandler Function for translating each message and metadata into the desired type */ @Experimental def createDirectStream[ K: ClassTag, V: ClassTag, - U <: Decoder[_]: ClassTag, - T <: Decoder[_]: ClassTag, + KD <: Decoder[K]: ClassTag, + VD <: Decoder[V]: ClassTag, R: ClassTag] ( ssc: StreamingContext, kafkaParams: Map[String, String], fromOffsets: Map[TopicAndPartition, Long], messageHandler: MessageAndMetadata[K, V] => R ): InputDStream[R] = { - new DirectKafkaInputDStream[K, V, U, T, R]( + new DirectKafkaInputDStream[K, V, KD, VD, R]( ssc, kafkaParams, fromOffsets, messageHandler) } /** - * This stream can guarantee that each message from Kafka is included in transformations - * (as opposed to output actions) exactly once, even in most failure situations. + * :: Experimental :: + * Create an input stream that directly pulls messages from Kafka Brokers + * without using any receiver. This stream can guarantee that each message + * from Kafka is included in transformations exactly once (see points below). * * Points to note: - * - * Failure Recovery - You must checkpoint this stream. - * Kafka must have sufficient log retention to obtain messages after failure. - * - * Getting offsets from the stream - see programming guide - * -. * Zookeeper - This does not use Zookeeper to store offsets. For interop with Kafka monitors - * that depend on Zookeeper, you must store offsets in ZK yourself. - * - * End-to-end semantics - This does not guarantee that any output operation will push each record - * exactly once. To ensure end-to-end exactly-once semantics (that is, receiving exactly once and - * outputting exactly once), you have to ensure that the output operation is idempotent. + * - No receivers: This stream does not use any receiver. It directly queries Kafka + * - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked + * by the stream itself. For interoperability with Kafka monitoring tools that depend on + * Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application. + * You can access the offsets used in each batch from the generated RDDs (see + * [[org.apache.spark.streaming.kafka.HasOffsetRanges]]). + * - Failure Recovery: To recover from driver failures, you have to enable checkpointing + * in the [[StreamingContext]]. The information on consumed offset can be + * recovered from the checkpoint. See the programming guide for details (constraints, etc.). + * - End-to-end semantics: This stream ensures that every records is effectively received and + * transformed exactly once, but gives no guarantees on whether the transformed data are + * outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure + * that the output operation is idempotent, or use transactions to output records atomically. + * See the programming guide for more details. * * @param ssc StreamingContext object * @param kafkaParams Kafka - * configuration parameters. - * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s), - * NOT zookeeper servers, specified in host1:port1,host2:port2 form. 
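The "Failure Recovery" point above is the usual checkpoint-and-recreate pattern; a hedged sketch follows, with the checkpoint directory, broker address, and topic set chosen as placeholders.

    import kafka.serializer.StringDecoder
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.kafka.KafkaUtils

    def createContext(): StreamingContext = {
      val sparkConf = new SparkConf().setAppName("DirectKafkaRecovery")
      val ssc = new StreamingContext(sparkConf, Seconds(2))
      ssc.checkpoint("/tmp/direct-kafka-checkpoint")
      val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
        ssc, Map("metadata.broker.list" -> "broker1:9092"), Set("events"))
      stream.map(_._2).count().print()
      ssc
    }

    // On a clean start this builds a new context; after a driver failure the context,
    // including the consumed offsets, is restored from the checkpoint.
    val ssc = StreamingContext.getOrCreate("/tmp/direct-kafka-checkpoint", createContext _)
    ssc.start()
    ssc.awaitTermination()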
- * If starting without a checkpoint, "auto.offset.reset" may be set to "largest" or "smallest" + * configuration parameters. Requires "metadata.broker.list" or "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers), specified in + * host1:port1,host2:port2 form. + * If not starting from a checkpoint, "auto.offset.reset" may be set to "largest" or "smallest" * to determine where the stream starts (defaults to "largest") - * @param topics names of the topics to consume + * @param topics Names of the topics to consume */ @Experimental def createDirectStream[ K: ClassTag, V: ClassTag, - U <: Decoder[_]: ClassTag, - T <: Decoder[_]: ClassTag] ( + KD <: Decoder[K]: ClassTag, + VD <: Decoder[V]: ClassTag] ( ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String] @@ -313,11 +391,128 @@ object KafkaUtils { val fromOffsets = leaderOffsets.map { case (tp, lo) => (tp, lo.offset) } - new DirectKafkaInputDStream[K, V, U, T, (K, V)]( + new DirectKafkaInputDStream[K, V, KD, VD, (K, V)]( ssc, kafkaParams, fromOffsets, messageHandler) }).fold( errs => throw new SparkException(errs.mkString("\n")), ok => ok ) } + + /** + * :: Experimental :: + * Create an input stream that directly pulls messages from Kafka Brokers + * without using any receiver. This stream can guarantee that each message + * from Kafka is included in transformations exactly once (see points below). + * + * Points to note: + * - No receivers: This stream does not use any receiver. It directly queries Kafka + * - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked + * by the stream itself. For interoperability with Kafka monitoring tools that depend on + * Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application. + * You can access the offsets used in each batch from the generated RDDs (see + * [[org.apache.spark.streaming.kafka.HasOffsetRanges]]). + * - Failure Recovery: To recover from driver failures, you have to enable checkpointing + * in the [[StreamingContext]]. The information on consumed offset can be + * recovered from the checkpoint. See the programming guide for details (constraints, etc.). + * - End-to-end semantics: This stream ensures that every records is effectively received and + * transformed exactly once, but gives no guarantees on whether the transformed data are + * outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure + * that the output operation is idempotent, or use transactions to output records atomically. + * See the programming guide for more details. + * + * @param jssc JavaStreamingContext object + * @param keyClass Class of the keys in the Kafka records + * @param valueClass Class of the values in the Kafka records + * @param keyDecoderClass Class of the key decoder + * @param valueDecoderClass Class of the value decoder + * @param recordClass Class of the records in DStream + * @param kafkaParams Kafka + * configuration parameters. Requires "metadata.broker.list" or "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers), specified in + * host1:port1,host2:port2 form. 
+ * @param fromOffsets Per-topic/partition Kafka offsets defining the (inclusive) + * starting point of the stream + * @param messageHandler Function for translating each message and metadata into the desired type + */ + @Experimental + def createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V], R]( + jssc: JavaStreamingContext, + keyClass: Class[K], + valueClass: Class[V], + keyDecoderClass: Class[KD], + valueDecoderClass: Class[VD], + recordClass: Class[R], + kafkaParams: JMap[String, String], + fromOffsets: JMap[TopicAndPartition, JLong], + messageHandler: JFunction[MessageAndMetadata[K, V], R] + ): JavaInputDStream[R] = { + implicit val keyCmt: ClassTag[K] = ClassTag(keyClass) + implicit val valueCmt: ClassTag[V] = ClassTag(valueClass) + implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass) + implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass) + implicit val recordCmt: ClassTag[R] = ClassTag(recordClass) + createDirectStream[K, V, KD, VD, R]( + jssc.ssc, + Map(kafkaParams.toSeq: _*), + Map(fromOffsets.mapValues { _.longValue() }.toSeq: _*), + messageHandler.call _ + ) + } + + /** + * :: Experimental :: + * Create an input stream that directly pulls messages from Kafka Brokers + * without using any receiver. This stream can guarantee that each message + * from Kafka is included in transformations exactly once (see points below). + * + * Points to note: + * - No receivers: This stream does not use any receiver. It directly queries Kafka + * - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked + * by the stream itself. For interoperability with Kafka monitoring tools that depend on + * Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application. + * You can access the offsets used in each batch from the generated RDDs (see + * [[org.apache.spark.streaming.kafka.HasOffsetRanges]]). + * - Failure Recovery: To recover from driver failures, you have to enable checkpointing + * in the [[StreamingContext]]. The information on consumed offset can be + * recovered from the checkpoint. See the programming guide for details (constraints, etc.). + * - End-to-end semantics: This stream ensures that every records is effectively received and + * transformed exactly once, but gives no guarantees on whether the transformed data are + * outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure + * that the output operation is idempotent, or use transactions to output records atomically. + * See the programming guide for more details. + * + * @param jssc JavaStreamingContext object + * @param keyClass Class of the keys in the Kafka records + * @param valueClass Class of the values in the Kafka records + * @param keyDecoderClass Class of the key decoder + * @param valueDecoderClass Class type of the value decoder + * @param kafkaParams Kafka + * configuration parameters. Requires "metadata.broker.list" or "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers), specified in + * host1:port1,host2:port2 form. 
+ * If not starting from a checkpoint, "auto.offset.reset" may be set to "largest" or "smallest" + * to determine where the stream starts (defaults to "largest") + * @param topics Names of the topics to consume + */ + @Experimental + def createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V], R]( + jssc: JavaStreamingContext, + keyClass: Class[K], + valueClass: Class[V], + keyDecoderClass: Class[KD], + valueDecoderClass: Class[VD], + kafkaParams: JMap[String, String], + topics: JSet[String] + ): JavaPairInputDStream[K, V] = { + implicit val keyCmt: ClassTag[K] = ClassTag(keyClass) + implicit val valueCmt: ClassTag[V] = ClassTag(valueClass) + implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass) + implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass) + createDirectStream[K, V, KD, VD]( + jssc.ssc, + Map(kafkaParams.toSeq: _*), + Set(topics.toSeq: _*) + ) + } } diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Leader.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Leader.scala index 3454d92e72b47..c129a26836c0d 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Leader.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Leader.scala @@ -19,17 +19,28 @@ package org.apache.spark.streaming.kafka import kafka.common.TopicAndPartition -/** Host info for the leader of a Kafka TopicAndPartition */ +import org.apache.spark.annotation.Experimental + +/** + * :: Experimental :: + * Represent the host info for the leader of a Kafka partition. + */ +@Experimental final class Leader private( - /** kafka topic name */ + /** Kafka topic name */ val topic: String, - /** kafka partition id */ + /** Kafka partition id */ val partition: Int, - /** kafka hostname */ + /** Leader's hostname */ val host: String, - /** kafka host's port */ + /** Leader's port */ val port: Int) extends Serializable +/** + * :: Experimental :: + * Companion object the provides methods to create instances of [[Leader]]. + */ +@Experimental object Leader { def create(topic: String, partition: Int, host: String, port: Int): Leader = new Leader(topic, partition, host, port) diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala index 334c12e4627b4..9c3dfeb8f5928 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala @@ -19,16 +19,35 @@ package org.apache.spark.streaming.kafka import kafka.common.TopicAndPartition -/** Something that has a collection of OffsetRanges */ +import org.apache.spark.annotation.Experimental + +/** + * :: Experimental :: + * Represents any object that has a collection of [[OffsetRange]]s. This can be used access the + * offset ranges in RDDs generated by the direct Kafka DStream (see + * [[KafkaUtils.createDirectStream()]]). + * {{{ + * KafkaUtils.createDirectStream(...).foreachRDD { rdd => + * val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + * ... + * } + * }}} + */ +@Experimental trait HasOffsetRanges { def offsetRanges: Array[OffsetRange] } -/** Represents a range of offsets from a single Kafka TopicAndPartition */ +/** + * :: Experimental :: + * Represents a range of offsets from a single Kafka TopicAndPartition. Instances of this class + * can be created with `OffsetRange.create()`. 
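Fleshing out the "..." in the HasOffsetRanges example above, one way the per-batch offsets might be surfaced for logging or an external monitor; `stream` is assumed to come from createDirectStream.

    import org.apache.spark.streaming.kafka.{HasOffsetRanges, OffsetRange}

    stream.foreachRDD { rdd =>
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      ranges.foreach { r =>
        // Prints the topic, partition and [fromOffset, untilOffset) range for this batch.
        println(s"consumed ${r.untilOffset - r.fromOffset} records from $r")
      }
    }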
+ */ +@Experimental final class OffsetRange private( - /** kafka topic name */ + /** Kafka topic name */ val topic: String, - /** kafka partition id */ + /** Kafka partition id */ val partition: Int, /** inclusive starting offset */ val fromOffset: Long, @@ -36,11 +55,33 @@ final class OffsetRange private( val untilOffset: Long) extends Serializable { import OffsetRange.OffsetRangeTuple + override def equals(obj: Any): Boolean = obj match { + case that: OffsetRange => + this.topic == that.topic && + this.partition == that.partition && + this.fromOffset == that.fromOffset && + this.untilOffset == that.untilOffset + case _ => false + } + + override def hashCode(): Int = { + toTuple.hashCode() + } + + override def toString(): String = { + s"OffsetRange(topic: '$topic', partition: $partition, range: [$fromOffset -> $untilOffset]" + } + /** this is to avoid ClassNotFoundException during checkpoint restore */ private[streaming] def toTuple: OffsetRangeTuple = (topic, partition, fromOffset, untilOffset) } +/** + * :: Experimental :: + * Companion object the provides methods to create instances of [[OffsetRange]]. + */ +@Experimental object OffsetRange { def create(topic: String, partition: Int, fromOffset: Long, untilOffset: Long): OffsetRange = new OffsetRange(topic, partition, fromOffset, untilOffset) @@ -61,10 +102,10 @@ object OffsetRange { new OffsetRange(topicAndPartition.topic, topicAndPartition.partition, fromOffset, untilOffset) /** this is to avoid ClassNotFoundException during checkpoint restore */ - private[spark] + private[kafka] type OffsetRangeTuple = (String, Int, Long, Long) - private[streaming] + private[kafka] def apply(t: OffsetRangeTuple) = new OffsetRange(t._1, t._2, t._3, t._4) } diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java new file mode 100644 index 0000000000000..1334cc8fd1b57 --- /dev/null +++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaDirectKafkaStreamSuite.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Random; +import java.util.Arrays; + +import org.apache.spark.SparkConf; + +import scala.Tuple2; + +import junit.framework.Assert; + +import kafka.common.TopicAndPartition; +import kafka.message.MessageAndMetadata; +import kafka.serializer.StringDecoder; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.apache.spark.streaming.Durations; +import org.apache.spark.streaming.api.java.JavaStreamingContext; + +import org.junit.Test; +import org.junit.After; +import org.junit.Before; + +public class JavaDirectKafkaStreamSuite implements Serializable { + private transient JavaStreamingContext ssc = null; + private transient Random random = new Random(); + private transient KafkaStreamSuiteBase suiteBase = null; + + @Before + public void setUp() { + suiteBase = new KafkaStreamSuiteBase() { }; + suiteBase.setupKafka(); + System.clearProperty("spark.driver.port"); + SparkConf sparkConf = new SparkConf() + .setMaster("local[4]").setAppName(this.getClass().getSimpleName()); + ssc = new JavaStreamingContext(sparkConf, Durations.milliseconds(200)); + } + + @After + public void tearDown() { + ssc.stop(); + ssc = null; + System.clearProperty("spark.driver.port"); + suiteBase.tearDownKafka(); + } + + @Test + public void testKafkaStream() throws InterruptedException { + String topic1 = "topic1"; + String topic2 = "topic2"; + + String[] topic1data = createTopicAndSendData(topic1); + String[] topic2data = createTopicAndSendData(topic2); + + HashSet sent = new HashSet(); + sent.addAll(Arrays.asList(topic1data)); + sent.addAll(Arrays.asList(topic2data)); + + HashMap kafkaParams = new HashMap(); + kafkaParams.put("metadata.broker.list", suiteBase.brokerAddress()); + kafkaParams.put("auto.offset.reset", "smallest"); + + JavaDStream stream1 = KafkaUtils.createDirectStream( + ssc, + String.class, + String.class, + StringDecoder.class, + StringDecoder.class, + kafkaParams, + topicToSet(topic1) + ).map( + new Function, String>() { + @Override + public String call(scala.Tuple2 kv) throws Exception { + return kv._2(); + } + } + ); + + JavaDStream stream2 = KafkaUtils.createDirectStream( + ssc, + String.class, + String.class, + StringDecoder.class, + StringDecoder.class, + String.class, + kafkaParams, + topicOffsetToMap(topic2, (long) 0), + new Function, String>() { + @Override + public String call(MessageAndMetadata msgAndMd) throws Exception { + return msgAndMd.message(); + } + } + ); + JavaDStream unifiedStream = stream1.union(stream2); + + final HashSet result = new HashSet(); + unifiedStream.foreachRDD( + new Function, Void>() { + @Override + public Void call(org.apache.spark.api.java.JavaRDD rdd) throws Exception { + result.addAll(rdd.collect()); + return null; + } + } + ); + ssc.start(); + long startTime = System.currentTimeMillis(); + boolean matches = false; + while (!matches && System.currentTimeMillis() - startTime < 20000) { + matches = sent.size() == result.size(); + Thread.sleep(50); + } + Assert.assertEquals(sent, result); + ssc.stop(); + } + + private HashSet topicToSet(String topic) { + HashSet topicSet = new HashSet(); + topicSet.add(topic); + return topicSet; + } + + private HashMap topicOffsetToMap(String topic, Long offsetToStart) { + HashMap topicMap = new HashMap(); + topicMap.put(new TopicAndPartition(topic, 0), offsetToStart); + 
return topicMap; + } + + private String[] createTopicAndSendData(String topic) { + String[] data = { topic + "-1", topic + "-2", topic + "-3"}; + suiteBase.createTopic(topic); + suiteBase.sendMessages(topic, data); + return data; + } +} diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java index 6e1abf3f385ee..208cc51b29876 100644 --- a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java +++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaStreamSuite.java @@ -79,9 +79,10 @@ public void testKafkaStream() throws InterruptedException { suiteBase.createTopic(topic); HashMap tmp = new HashMap(sent); - suiteBase.produceAndSendMessage(topic, + suiteBase.sendMessages(topic, JavaConverters.mapAsScalaMapConverter(tmp).asScala().toMap( - Predef.>conforms())); + Predef.>conforms()) + ); HashMap kafkaParams = new HashMap(); kafkaParams.put("zookeeper.connect", suiteBase.zkAddress()); diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala new file mode 100644 index 0000000000000..b25c2120d54f7 --- /dev/null +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala @@ -0,0 +1,302 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka + +import java.io.File + +import scala.collection.mutable +import scala.concurrent.duration._ +import scala.language.postfixOps + +import kafka.serializer.StringDecoder +import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} +import org.scalatest.concurrent.{Eventually, Timeouts} + +import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.{Milliseconds, StreamingContext, Time} +import org.apache.spark.streaming.dstream.{DStream, InputDStream} +import org.apache.spark.util.Utils +import kafka.common.TopicAndPartition +import kafka.message.MessageAndMetadata + +class DirectKafkaStreamSuite extends KafkaStreamSuiteBase + with BeforeAndAfter with BeforeAndAfterAll with Eventually { + val sparkConf = new SparkConf() + .setMaster("local[4]") + .setAppName(this.getClass.getSimpleName) + + var sc: SparkContext = _ + var ssc: StreamingContext = _ + var testDir: File = _ + + override def beforeAll { + setupKafka() + } + + override def afterAll { + tearDownKafka() + } + + after { + if (ssc != null) { + ssc.stop() + sc = null + } + if (sc != null) { + sc.stop() + } + if (testDir != null) { + Utils.deleteRecursively(testDir) + } + } + + + test("basic stream receiving with multiple topics and smallest starting offset") { + val topics = Set("basic1", "basic2", "basic3") + val data = Map("a" -> 7, "b" -> 9) + topics.foreach { t => + createTopic(t) + sendMessages(t, data) + } + val kafkaParams = Map( + "metadata.broker.list" -> s"$brokerAddress", + "auto.offset.reset" -> "smallest" + ) + + ssc = new StreamingContext(sparkConf, Milliseconds(200)) + val stream = withClue("Error creating direct stream") { + KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( + ssc, kafkaParams, topics) + } + var total = 0L + + stream.foreachRDD { rdd => + // Get the offset ranges in the RDD + val offsets = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + val collected = rdd.mapPartitionsWithIndex { (i, iter) => + // For each partition, get size of the range in the partition, + // and the number of items in the partition + val off = offsets(i) + val all = iter.toSeq + val partSize = all.size + val rangeSize = off.untilOffset - off.fromOffset + Iterator((partSize, rangeSize)) + }.collect + + // Verify whether number of elements in each partition + // matches with the corresponding offset range + collected.foreach { case (partSize, rangeSize) => + assert(partSize === rangeSize, "offset ranges are wrong") + } + total += collected.size // Add up all the collected items + } + ssc.start() + eventually(timeout(20000.milliseconds), interval(200.milliseconds)) { + assert(total === data.values.sum * topics.size, "didn't get all messages") + } + ssc.stop() + } + + test("receiving from largest starting offset") { + val topic = "largest" + val topicPartition = TopicAndPartition(topic, 0) + val data = Map("a" -> 10) + createTopic(topic) + val kafkaParams = Map( + "metadata.broker.list" -> s"$brokerAddress", + "auto.offset.reset" -> "largest" + ) + val kc = new KafkaCluster(kafkaParams) + def getLatestOffset(): Long = { + kc.getLatestLeaderOffsets(Set(topicPartition)).right.get(topicPartition).offset + } + + // Send some initial messages before starting context + sendMessages(topic, data) + eventually(timeout(10 seconds), interval(20 milliseconds)) { + assert(getLatestOffset() > 3) + } + val offsetBeforeStart = getLatestOffset() + + // Setup context and kafka stream with largest offset + ssc = new 
StreamingContext(sparkConf, Milliseconds(200)) + val stream = withClue("Error creating direct stream") { + KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( + ssc, kafkaParams, Set(topic)) + } + assert( + stream.asInstanceOf[DirectKafkaInputDStream[_, _, _, _, _]] + .fromOffsets(topicPartition) >= offsetBeforeStart, + "Start offset not from latest" + ) + + val collectedData = new mutable.ArrayBuffer[String]() + stream.map { _._2 }.foreachRDD { rdd => collectedData ++= rdd.collect() } + ssc.start() + val newData = Map("b" -> 10) + sendMessages(topic, newData) + eventually(timeout(10 seconds), interval(50 milliseconds)) { + collectedData.contains("b") + } + assert(!collectedData.contains("a")) + } + + + test("creating stream by offset") { + val topic = "offset" + val topicPartition = TopicAndPartition(topic, 0) + val data = Map("a" -> 10) + createTopic(topic) + val kafkaParams = Map( + "metadata.broker.list" -> s"$brokerAddress", + "auto.offset.reset" -> "largest" + ) + val kc = new KafkaCluster(kafkaParams) + def getLatestOffset(): Long = { + kc.getLatestLeaderOffsets(Set(topicPartition)).right.get(topicPartition).offset + } + + // Send some initial messages before starting context + sendMessages(topic, data) + eventually(timeout(10 seconds), interval(20 milliseconds)) { + assert(getLatestOffset() >= 10) + } + val offsetBeforeStart = getLatestOffset() + + // Setup context and kafka stream with largest offset + ssc = new StreamingContext(sparkConf, Milliseconds(200)) + val stream = withClue("Error creating direct stream") { + KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String]( + ssc, kafkaParams, Map(topicPartition -> 11L), + (m: MessageAndMetadata[String, String]) => m.message()) + } + assert( + stream.asInstanceOf[DirectKafkaInputDStream[_, _, _, _, _]] + .fromOffsets(topicPartition) >= offsetBeforeStart, + "Start offset not from latest" + ) + + val collectedData = new mutable.ArrayBuffer[String]() + stream.foreachRDD { rdd => collectedData ++= rdd.collect() } + ssc.start() + val newData = Map("b" -> 10) + sendMessages(topic, newData) + eventually(timeout(10 seconds), interval(50 milliseconds)) { + collectedData.contains("b") + } + assert(!collectedData.contains("a")) + } + + // Test to verify the offset ranges can be recovered from the checkpoints + test("offset recovery") { + val topic = "recovery" + createTopic(topic) + testDir = Utils.createTempDir() + + val kafkaParams = Map( + "metadata.broker.list" -> s"$brokerAddress", + "auto.offset.reset" -> "smallest" + ) + + // Send data to Kafka and wait for it to be received + def sendDataAndWaitForReceive(data: Seq[Int]) { + val strings = data.map { _.toString} + sendMessages(topic, strings.map { _ -> 1}.toMap) + eventually(timeout(10 seconds), interval(50 milliseconds)) { + assert(strings.forall { DirectKafkaStreamSuite.collectedData.contains }) + } + } + + // Setup the streaming context + ssc = new StreamingContext(sparkConf, Milliseconds(100)) + val kafkaStream = withClue("Error creating direct stream") { + KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( + ssc, kafkaParams, Set(topic)) + } + val keyedStream = kafkaStream.map { v => "key" -> v._2.toInt } + val stateStream = keyedStream.updateStateByKey { (values: Seq[Int], state: Option[Int]) => + Some(values.sum + state.getOrElse(0)) + } + ssc.checkpoint(testDir.getAbsolutePath) + + // This is to collect the raw data received from Kafka + kafkaStream.foreachRDD { (rdd: RDD[(String, String)], time: 
Time) => + val data = rdd.map { _._2 }.collect() + DirectKafkaStreamSuite.collectedData.appendAll(data) + } + + // This is ensure all the data is eventually receiving only once + stateStream.foreachRDD { (rdd: RDD[(String, Int)]) => + rdd.collect().headOption.foreach { x => DirectKafkaStreamSuite.total = x._2 } + } + ssc.start() + + // Send some data and wait for them to be received + for (i <- (1 to 10).grouped(4)) { + sendDataAndWaitForReceive(i) + } + + // Verify that offset ranges were generated + val offsetRangesBeforeStop = getOffsetRanges(kafkaStream) + assert(offsetRangesBeforeStop.size >= 1, "No offset ranges generated") + assert( + offsetRangesBeforeStop.head._2.forall { _.fromOffset === 0 }, + "starting offset not zero" + ) + ssc.stop() + logInfo("====== RESTARTING ========") + + // Recover context from checkpoints + ssc = new StreamingContext(testDir.getAbsolutePath) + val recoveredStream = ssc.graph.getInputStreams().head.asInstanceOf[DStream[(String, String)]] + + // Verify offset ranges have been recovered + val recoveredOffsetRanges = getOffsetRanges(recoveredStream) + assert(recoveredOffsetRanges.size > 0, "No offset ranges recovered") + val earlierOffsetRangesAsSets = offsetRangesBeforeStop.map { x => (x._1, x._2.toSet) } + assert( + recoveredOffsetRanges.forall { or => + earlierOffsetRangesAsSets.contains((or._1, or._2.toSet)) + }, + "Recovered ranges are not the same as the ones generated" + ) + + // Restart context, give more data and verify the total at the end + // If the total is write that means each records has been received only once + ssc.start() + sendDataAndWaitForReceive(11 to 20) + eventually(timeout(10 seconds), interval(50 milliseconds)) { + assert(DirectKafkaStreamSuite.total === (1 to 20).sum) + } + ssc.stop() + } + + /** Get the generated offset ranges from the DirectKafkaStream */ + private def getOffsetRanges[K, V]( + kafkaStream: DStream[(K, V)]): Seq[(Time, Array[OffsetRange])] = { + kafkaStream.generatedRDDs.mapValues { rdd => + rdd.asInstanceOf[KafkaRDD[K, V, _, _, (K, V)]].offsetRanges + }.toSeq.sortBy { _._1 } + } +} + +object DirectKafkaStreamSuite { + val collectedData = new mutable.ArrayBuffer[String]() + var total = -1L +} diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala index e57c8f6987fdc..fc9275b7207be 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaClusterSuite.scala @@ -19,33 +19,29 @@ package org.apache.spark.streaming.kafka import scala.util.Random -import org.scalatest.BeforeAndAfter import kafka.common.TopicAndPartition +import org.scalatest.BeforeAndAfterAll -class KafkaClusterSuite extends KafkaStreamSuiteBase with BeforeAndAfter { - val brokerHost = "localhost" - - val kafkaParams = Map("metadata.broker.list" -> s"$brokerHost:$brokerPort") - - val kc = new KafkaCluster(kafkaParams) - +class KafkaClusterSuite extends KafkaStreamSuiteBase with BeforeAndAfterAll { val topic = "kcsuitetopic" + Random.nextInt(10000) - val topicAndPartition = TopicAndPartition(topic, 0) + var kc: KafkaCluster = null - before { + override def beforeAll() { setupKafka() createTopic(topic) - produceAndSendMessage(topic, Map("a" -> 1)) + sendMessages(topic, Map("a" -> 1)) + kc = new KafkaCluster(Map("metadata.broker.list" -> s"$brokerAddress")) } - after { + override def afterAll() { 
tearDownKafka() } test("metadata apis") { - val leader = kc.findLeaders(Set(topicAndPartition)).right.get - assert(leader(topicAndPartition) === (brokerHost, brokerPort), "didn't get leader") + val leader = kc.findLeaders(Set(topicAndPartition)).right.get(topicAndPartition) + val leaderAddress = s"${leader._1}:${leader._2}" + assert(leaderAddress === brokerAddress, "didn't get leader") val parts = kc.getPartitions(Set(topic)).right.get assert(parts(topicAndPartition), "didn't get partitions") diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaDirectStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaDirectStreamSuite.scala deleted file mode 100644 index 0891ce344f16a..0000000000000 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaDirectStreamSuite.scala +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming.kafka - -import scala.util.Random -import scala.concurrent.duration._ - -import org.scalatest.BeforeAndAfter -import org.scalatest.concurrent.Eventually - -import kafka.serializer.StringDecoder - -import org.apache.spark.SparkConf -import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.{Milliseconds, StreamingContext} - -class KafkaDirectStreamSuite extends KafkaStreamSuiteBase with BeforeAndAfter with Eventually { - val sparkConf = new SparkConf() - .setMaster("local[4]") - .setAppName(this.getClass.getSimpleName) - - val brokerHost = "localhost" - - val kafkaParams = Map( - "metadata.broker.list" -> s"$brokerHost:$brokerPort", - "auto.offset.reset" -> "smallest" - ) - - var ssc: StreamingContext = _ - - before { - setupKafka() - - ssc = new StreamingContext(sparkConf, Milliseconds(500)) - } - - after { - if (ssc != null) { - ssc.stop() - } - tearDownKafka() - } - - test("multi topic stream") { - val topics = Set("newA", "newB") - val data = Map("a" -> 7, "b" -> 9) - topics.foreach { t => - createTopic(t) - produceAndSendMessage(t, data) - } - val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( - ssc, kafkaParams, topics) - var total = 0L; - - stream.foreachRDD { rdd => - val offsets = rdd.asInstanceOf[HasOffsetRanges].offsetRanges - val collected = rdd.mapPartitionsWithIndex { (i, iter) => - val off = offsets(i) - val all = iter.toSeq - val partSize = all.size - val rangeSize = off.untilOffset - off.fromOffset - all.map { _ => - (partSize, rangeSize) - }.toIterator - }.collect - collected.foreach { case (partSize, rangeSize) => - assert(partSize === rangeSize, "offset ranges are wrong") - } - total += collected.size - } - ssc.start() - eventually(timeout(20000.milliseconds), interval(200.milliseconds)) { - assert(total === 
data.values.sum * topics.size, "didn't get all messages") - } - ssc.stop() - } -} diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala index 9b9e3f5fce8bd..6774db854a0d0 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala @@ -46,9 +46,9 @@ class KafkaRDDSuite extends KafkaStreamSuiteBase with BeforeAndAfter { val topic = "topic1" val sent = Map("a" -> 5, "b" -> 3, "c" -> 10) createTopic(topic) - produceAndSendMessage(topic, sent) + sendMessages(topic, sent) - val kafkaParams = Map("metadata.broker.list" -> s"localhost:$brokerPort", + val kafkaParams = Map("metadata.broker.list" -> brokerAddress, "group.id" -> s"test-consumer-${Random.nextInt(10000)}") val kc = new KafkaCluster(kafkaParams) @@ -65,14 +65,14 @@ class KafkaRDDSuite extends KafkaStreamSuiteBase with BeforeAndAfter { val rdd2 = getRdd(kc, Set(topic)) val sent2 = Map("d" -> 1) - produceAndSendMessage(topic, sent2) + sendMessages(topic, sent2) // this is the "0 messages" case // make sure we dont get anything, since messages were sent after rdd was defined assert(rdd2.isDefined) assert(rdd2.get.count === 0) val rdd3 = getRdd(kc, Set(topic)) - produceAndSendMessage(topic, Map("extra" -> 22)) + sendMessages(topic, Map("extra" -> 22)) // this is the "exactly 1 message" case // make sure we get exactly one message, despite there being lots more available assert(rdd3.isDefined) diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala index f207dc6d4fa04..e4966eebb9b34 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala @@ -48,30 +48,41 @@ import org.apache.spark.util.Utils */ abstract class KafkaStreamSuiteBase extends FunSuite with Eventually with Logging { - var zkAddress: String = _ - var zkClient: ZkClient = _ - private val zkHost = "localhost" + private var zkPort: Int = 0 private val zkConnectionTimeout = 6000 private val zkSessionTimeout = 6000 private var zookeeper: EmbeddedZookeeper = _ - private var zkPort: Int = 0 - protected var brokerPort = 9092 + private val brokerHost = "localhost" + private var brokerPort = 9092 private var brokerConf: KafkaConfig = _ private var server: KafkaServer = _ private var producer: Producer[String, String] = _ + private var zkReady = false + private var brokerReady = false + + protected var zkClient: ZkClient = _ + + def zkAddress: String = { + assert(zkReady, "Zookeeper not setup yet or already torn down, cannot get zookeeper address") + s"$zkHost:$zkPort" + } + + def brokerAddress: String = { + assert(brokerReady, "Kafka not setup yet or already torn down, cannot get broker address") + s"$brokerHost:$brokerPort" + } def setupKafka() { // Zookeeper server startup zookeeper = new EmbeddedZookeeper(s"$zkHost:$zkPort") // Get the actual zookeeper binding port zkPort = zookeeper.actualPort - zkAddress = s"$zkHost:$zkPort" - logInfo("==================== 0 ====================") + zkReady = true + logInfo("==================== Zookeeper Started ====================") - zkClient = new ZkClient(zkAddress, zkSessionTimeout, zkConnectionTimeout, - ZKStringSerializer) - 
logInfo("==================== 1 ====================") + zkClient = new ZkClient(zkAddress, zkSessionTimeout, zkConnectionTimeout, ZKStringSerializer) + logInfo("==================== Zookeeper Client Created ====================") // Kafka broker startup var bindSuccess: Boolean = false @@ -80,9 +91,8 @@ abstract class KafkaStreamSuiteBase extends FunSuite with Eventually with Loggin val brokerProps = getBrokerConfig() brokerConf = new KafkaConfig(brokerProps) server = new KafkaServer(brokerConf) - logInfo("==================== 2 ====================") server.startup() - logInfo("==================== 3 ====================") + logInfo("==================== Kafka Broker Started ====================") bindSuccess = true } catch { case e: KafkaException => @@ -94,10 +104,13 @@ abstract class KafkaStreamSuiteBase extends FunSuite with Eventually with Loggin } Thread.sleep(2000) - logInfo("==================== 4 ====================") + logInfo("==================== Kafka + Zookeeper Ready ====================") + brokerReady = true } def tearDownKafka() { + brokerReady = false + zkReady = false if (producer != null) { producer.close() producer = null @@ -121,26 +134,23 @@ abstract class KafkaStreamSuiteBase extends FunSuite with Eventually with Loggin } } - private def createTestMessage(topic: String, sent: Map[String, Int]) - : Seq[KeyedMessage[String, String]] = { - val messages = for ((s, freq) <- sent; i <- 0 until freq) yield { - new KeyedMessage[String, String](topic, s) - } - messages.toSeq - } - def createTopic(topic: String) { AdminUtils.createTopic(zkClient, topic, 1, 1) - logInfo("==================== 5 ====================") // wait until metadata is propagated waitUntilMetadataIsPropagated(topic, 0) + logInfo(s"==================== Topic $topic Created ====================") } - def produceAndSendMessage(topic: String, sent: Map[String, Int]) { + def sendMessages(topic: String, messageToFreq: Map[String, Int]) { + val messages = messageToFreq.flatMap { case (s, freq) => Seq.fill(freq)(s) }.toArray + sendMessages(topic, messages) + } + + def sendMessages(topic: String, messages: Array[String]) { producer = new Producer[String, String](new ProducerConfig(getProducerConfig())) - producer.send(createTestMessage(topic, sent): _*) + producer.send(messages.map { new KeyedMessage[String, String](topic, _ ) }: _*) producer.close() - logInfo("==================== 6 ====================") + logInfo(s"==================== Sent Messages: ${messages.mkString(", ")} ====================") } private def getBrokerConfig(): Properties = { @@ -218,7 +228,7 @@ class KafkaStreamSuite extends KafkaStreamSuiteBase with BeforeAndAfter { val topic = "topic1" val sent = Map("a" -> 5, "b" -> 3, "c" -> 10) createTopic(topic) - produceAndSendMessage(topic, sent) + sendMessages(topic, sent) val kafkaParams = Map("zookeeper.connect" -> zkAddress, "group.id" -> s"test-consumer-${Random.nextInt(10000)}", diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala index 64ccc92c81fa9..fc53c23abda85 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/ReliableKafkaStreamSuite.scala @@ -79,7 +79,7 @@ class ReliableKafkaStreamSuite extends KafkaStreamSuiteBase with BeforeAndAfter test("Reliable Kafka input stream with single topic") { var topic = 
"test-topic" createTopic(topic) - produceAndSendMessage(topic, data) + sendMessages(topic, data) // Verify whether the offset of this group/topic/partition is 0 before starting. assert(getCommitOffset(groupId, topic, 0) === None) @@ -111,7 +111,7 @@ class ReliableKafkaStreamSuite extends KafkaStreamSuiteBase with BeforeAndAfter val topics = Map("topic1" -> 1, "topic2" -> 1, "topic3" -> 1) topics.foreach { case (t, _) => createTopic(t) - produceAndSendMessage(t, data) + sendMessages(t, data) } // Before started, verify all the group/topic/partition offsets are 0. From 2d1e916730492f5d61b97da6c483d3223ca44315 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 10 Feb 2015 09:19:01 +0000 Subject: [PATCH 089/357] SPARK-5239 [CORE] JdbcRDD throws "java.lang.AbstractMethodError: oracle.jdbc.driver.xxxxxx.isClosed()Z" This is a completion of https://github.com/apache/spark/pull/4033 which was withdrawn for some reason. Author: Sean Owen Closes #4470 from srowen/SPARK-5239.2 and squashes the following commits: 2398bde [Sean Owen] Avoid use of JDBC4-only isClosed() --- core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala | 6 +++--- .../src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala index 642a12c1edf6c..4fe7622bda00f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala @@ -99,21 +99,21 @@ class JdbcRDD[T: ClassTag]( override def close() { try { - if (null != rs && ! rs.isClosed()) { + if (null != rs) { rs.close() } } catch { case e: Exception => logWarning("Exception closing resultset", e) } try { - if (null != stmt && ! stmt.isClosed()) { + if (null != stmt) { stmt.close() } } catch { case e: Exception => logWarning("Exception closing statement", e) } try { - if (null != conn && ! conn.isClosed()) { + if (null != conn) { conn.close() } logInfo("closed connection") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala index 0bec32cca1325..87304ce2496b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRDD.scala @@ -370,21 +370,21 @@ private[sql] class JDBCRDD( def close() { if (closed) return try { - if (null != rs && ! rs.isClosed()) { + if (null != rs) { rs.close() } } catch { case e: Exception => logWarning("Exception closing resultset", e) } try { - if (null != stmt && ! stmt.isClosed()) { + if (null != stmt) { stmt.close() } } catch { case e: Exception => logWarning("Exception closing statement", e) } try { - if (null != conn && ! conn.isClosed()) { + if (null != conn) { conn.close() } logInfo("closed connection") From ba667935f8670293f10b8bbe1e317b28d00f9875 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 10 Feb 2015 02:28:47 -0800 Subject: [PATCH 090/357] [SPARK-5700] [SQL] [Build] Bumps jets3t to 0.9.3 for hadoop-2.3 and hadoop-2.4 profiles This is a follow-up PR for #4454 and #4484. JetS3t 0.9.2 contains a log4j.properties file inside the artifact and breaks our tests (see SPARK-5696). This is fixed in 0.9.3. This PR also reverts hotfix changes introduced in #4484. The reason is that asking users to configure HiveThriftServer2 logging configurations in hive-log4j.properties can be unintuitive. 
[Review on Reviewable](https://reviewable.io/reviews/apache/spark/4499) Author: Cheng Lian Closes #4499 from liancheng/spark-5700 and squashes the following commits: 4f020c7 [Cheng Lian] Bumps jets3t to 0.9.3 for hadoop-2.3 and hadoop-2.4 profiles --- pom.xml | 4 ++-- .../spark/sql/hive/thriftserver/HiveThriftServer2.scala | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index a9e968af25453..56e37d42265c0 100644 --- a/pom.xml +++ b/pom.xml @@ -1578,7 +1578,7 @@ 2.3.0 2.5.0 - 0.9.2 + 0.9.3 0.98.7-hadoop2 3.1.1 hadoop2 @@ -1591,7 +1591,7 @@ 2.4.0 2.5.0 - 0.9.2 + 0.9.3 0.98.7-hadoop2 3.1.1 hadoop2 diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index 525777aa454c4..6e07df18b0e15 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.hive.thriftserver import org.apache.commons.logging.LogFactory -import org.apache.hadoop.hive.common.LogUtils import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.thrift.{ThriftBinaryCLIService, ThriftHttpCLIService} @@ -55,8 +54,6 @@ object HiveThriftServer2 extends Logging { System.exit(-1) } - LogUtils.initHiveLog4j() - logInfo("Starting SparkContext") SparkSQLEnv.init() From 50820f15277187e8522520a3ecae412abbdb4397 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Tue, 10 Feb 2015 15:45:38 +0000 Subject: [PATCH 091/357] [SPARK-1805] [EC2] Validate instance types Addresses [SPARK-1805](https://issues.apache.org/jira/browse/SPARK-1805), though doesn't resolve it completely. Error out quickly if the user asks for the master and slaves to have different AMI virtualization types, since we don't currently support that. In addition to that, we print warnings if the inputted instance types are not recognized, though I would prefer if we errored out. Elsewhere in the script it seems [we allow unrecognized instance types](https://github.com/apache/spark/blob/5de14cc2763a8211f77eeb55940dec025822eb78/ec2/spark_ec2.py#L331), though I think we should remove that. It's messy, but it should serve us until we enhance spark-ec2 to support clusters with mixed virtualization types. 
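A sketch of the cross-check described above, written in Scala for consistency with the other examples here (the real change below is in spark_ec2.py) and using a made-up subset of the instance-type table: both requested instance types must resolve to the same AMI virtualization type, while unrecognized types only produce a warning.

```scala
object Ec2VirtualizationCheckSketch {
  // Illustrative subset of the instance-type -> virtualization-type table.
  val instanceTypeToVirt: Map[String, String] = Map(
    "m1.large"   -> "pvm",
    "m3.xlarge"  -> "hvm",
    "r3.2xlarge" -> "hvm"
  )

  def validate(slaveType: String, masterType: String): Unit = {
    val slaveVirt  = instanceTypeToVirt.get(slaveType)
    val masterVirt = instanceTypeToVirt.get(masterType)
    // Unrecognized instance types are tolerated with a warning.
    if (slaveVirt.isEmpty)  println(s"Warning: unrecognized slave instance type: $slaveType")
    if (masterVirt.isEmpty) println(s"Warning: unrecognized master instance type: $masterType")
    // Mixed virtualization types are not supported, so fail fast when both resolve.
    for (s <- slaveVirt; m <- masterVirt if s != m) {
      sys.error(s"master is $m but slaves are $s: mixed virtualization types are unsupported")
    }
  }
}
```

For example, validate("m1.large", "m3.xlarge") fails fast because the types resolve to pvm and hvm, while an unknown type only triggers a warning.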
Author: Nicholas Chammas Closes #4455 from nchammas/ec2-master-slave-different-virtualization and squashes the following commits: ce28609 [Nicholas Chammas] fix style b0adba0 [Nicholas Chammas] validate input instance types --- ec2/spark_ec2.py | 132 +++++++++++++++++++++++++++++------------------ 1 file changed, 81 insertions(+), 51 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 3e4c49c0e1db6..fe510f12bcec6 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -112,6 +112,7 @@ def parse_args(): version="%prog {v}".format(v=SPARK_EC2_VERSION), usage="%prog [options] \n\n" + " can be: launch, destroy, login, stop, start, get-master, reboot-slaves") + parser.add_option( "-s", "--slaves", type="int", default=1, help="Number of slaves to launch (default: %default)") @@ -139,7 +140,9 @@ def parse_args(): help="Availability zone to launch instances in, or 'all' to spread " + "slaves across multiple (an additional $0.01/Gb for bandwidth" + "between zones applies) (default: a single zone chosen at random)") - parser.add_option("-a", "--ami", help="Amazon Machine Image ID to use") + parser.add_option( + "-a", "--ami", + help="Amazon Machine Image ID to use") parser.add_option( "-v", "--spark-version", default=DEFAULT_SPARK_VERSION, help="Version of Spark to use: 'X.Y.Z' or a specific git hash (default: %default)") @@ -179,10 +182,11 @@ def parse_args(): "Only possible on EBS-backed AMIs. " + "EBS volumes are only attached if --ebs-vol-size > 0." + "Only support up to 8 EBS volumes.") - parser.add_option("--placement-group", type="string", default=None, - help="Which placement group to try and launch " + - "instances into. Assumes placement group is already " + - "created.") + parser.add_option( + "--placement-group", type="string", default=None, + help="Which placement group to try and launch " + + "instances into. Assumes placement group is already " + + "created.") parser.add_option( "--swap", metavar="SWAP", type="int", default=1024, help="Swap space to set up per node, in MB (default: %default)") @@ -226,9 +230,11 @@ def parse_args(): "--copy-aws-credentials", action="store_true", default=False, help="Add AWS credentials to hadoop configuration to allow Spark to access S3") parser.add_option( - "--subnet-id", default=None, help="VPC subnet to launch instances in") + "--subnet-id", default=None, + help="VPC subnet to launch instances in") parser.add_option( - "--vpc-id", default=None, help="VPC to launch instances in") + "--vpc-id", default=None, + help="VPC to launch instances in") (opts, args) = parser.parse_args() if len(args) != 2: @@ -290,52 +296,54 @@ def is_active(instance): return (instance.state in ['pending', 'running', 'stopping', 'stopped']) -# Attempt to resolve an appropriate AMI given the architecture and region of the request. # Source: http://aws.amazon.com/amazon-linux-ami/instance-type-matrix/ # Last Updated: 2014-06-20 # For easy maintainability, please keep this manually-inputted dictionary sorted by key. 
+EC2_INSTANCE_TYPES = { + "c1.medium": "pvm", + "c1.xlarge": "pvm", + "c3.2xlarge": "pvm", + "c3.4xlarge": "pvm", + "c3.8xlarge": "pvm", + "c3.large": "pvm", + "c3.xlarge": "pvm", + "cc1.4xlarge": "hvm", + "cc2.8xlarge": "hvm", + "cg1.4xlarge": "hvm", + "cr1.8xlarge": "hvm", + "hi1.4xlarge": "pvm", + "hs1.8xlarge": "pvm", + "i2.2xlarge": "hvm", + "i2.4xlarge": "hvm", + "i2.8xlarge": "hvm", + "i2.xlarge": "hvm", + "m1.large": "pvm", + "m1.medium": "pvm", + "m1.small": "pvm", + "m1.xlarge": "pvm", + "m2.2xlarge": "pvm", + "m2.4xlarge": "pvm", + "m2.xlarge": "pvm", + "m3.2xlarge": "hvm", + "m3.large": "hvm", + "m3.medium": "hvm", + "m3.xlarge": "hvm", + "r3.2xlarge": "hvm", + "r3.4xlarge": "hvm", + "r3.8xlarge": "hvm", + "r3.large": "hvm", + "r3.xlarge": "hvm", + "t1.micro": "pvm", + "t2.medium": "hvm", + "t2.micro": "hvm", + "t2.small": "hvm", +} + + +# Attempt to resolve an appropriate AMI given the architecture and region of the request. def get_spark_ami(opts): - instance_types = { - "c1.medium": "pvm", - "c1.xlarge": "pvm", - "c3.2xlarge": "pvm", - "c3.4xlarge": "pvm", - "c3.8xlarge": "pvm", - "c3.large": "pvm", - "c3.xlarge": "pvm", - "cc1.4xlarge": "hvm", - "cc2.8xlarge": "hvm", - "cg1.4xlarge": "hvm", - "cr1.8xlarge": "hvm", - "hi1.4xlarge": "pvm", - "hs1.8xlarge": "pvm", - "i2.2xlarge": "hvm", - "i2.4xlarge": "hvm", - "i2.8xlarge": "hvm", - "i2.xlarge": "hvm", - "m1.large": "pvm", - "m1.medium": "pvm", - "m1.small": "pvm", - "m1.xlarge": "pvm", - "m2.2xlarge": "pvm", - "m2.4xlarge": "pvm", - "m2.xlarge": "pvm", - "m3.2xlarge": "hvm", - "m3.large": "hvm", - "m3.medium": "hvm", - "m3.xlarge": "hvm", - "r3.2xlarge": "hvm", - "r3.4xlarge": "hvm", - "r3.8xlarge": "hvm", - "r3.large": "hvm", - "r3.xlarge": "hvm", - "t1.micro": "pvm", - "t2.medium": "hvm", - "t2.micro": "hvm", - "t2.small": "hvm", - } - if opts.instance_type in instance_types: - instance_type = instance_types[opts.instance_type] + if opts.instance_type in EC2_INSTANCE_TYPES: + instance_type = EC2_INSTANCE_TYPES[opts.instance_type] else: instance_type = "pvm" print >> stderr,\ @@ -605,8 +613,6 @@ def launch_cluster(conn, opts, cluster_name): # Get the EC2 instances in an existing cluster if available. # Returns a tuple of lists of EC2 instance objects for the masters and slaves - - def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): print "Searching for existing cluster " + cluster_name + "..." reservations = conn.get_all_reservations() @@ -1050,6 +1056,30 @@ def real_main(): print >> stderr, 'You can fix this with: chmod 400 "{f}"'.format(f=opts.identity_file) sys.exit(1) + if opts.instance_type not in EC2_INSTANCE_TYPES: + print >> stderr, "Warning: Unrecognized EC2 instance type for instance-type: {t}".format( + t=opts.instance_type) + + if opts.master_instance_type != "": + if opts.master_instance_type not in EC2_INSTANCE_TYPES: + print >> stderr, \ + "Warning: Unrecognized EC2 instance type for master-instance-type: {t}".format( + t=opts.master_instance_type) + # Since we try instance types even if we can't resolve them, we check if they resolve first + # and, if they do, see if they resolve to the same virtualization type. + if opts.instance_type in EC2_INSTANCE_TYPES and \ + opts.master_instance_type in EC2_INSTANCE_TYPES: + if EC2_INSTANCE_TYPES[opts.instance_type] != \ + EC2_INSTANCE_TYPES[opts.master_instance_type]: + print >> stderr, \ + "Error: spark-ec2 currently does not support having a master and slaves with " + \ + "different AMI virtualization types." 
+ print >> stderr, "master instance virtualization type: {t}".format( + t=EC2_INSTANCE_TYPES[opts.master_instance_type]) + print >> stderr, "slave instance virtualization type: {t}".format( + t=EC2_INSTANCE_TYPES[opts.instance_type]) + sys.exit(1) + if opts.ebs_vol_num > 8: print >> stderr, "ebs-vol-num cannot be greater than 8" sys.exit(1) From 6cc96cf0c3ea87ab65d42a59725959d94701577b Mon Sep 17 00:00:00 2001 From: JqueryFan Date: Tue, 10 Feb 2015 17:37:32 +0000 Subject: [PATCH 092/357] [Spark-5717] [MLlib] add stop and reorganize import Trivial. add sc stop and reorganize import https://issues.apache.org/jira/browse/SPARK-5717 Author: JqueryFan Author: Yuhao Yang Closes #4503 from hhbyyh/scstop and squashes the following commits: 7837a2c [JqueryFan] revert import change 2e85cc1 [Yuhao Yang] add stop and reorganize import --- .../java/org/apache/spark/examples/mllib/JavaLDAExample.java | 1 + .../main/scala/org/apache/spark/examples/mllib/LDAExample.scala | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java index f394ff2084463..36207ae38d9a9 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLDAExample.java @@ -71,5 +71,6 @@ public Tuple2 call(Tuple2 doc_id) { } System.out.println(); } + sc.stop(); } } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala index 0e1b27a8bd2ee..11399a7633638 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LDAExample.scala @@ -159,7 +159,7 @@ object LDAExample { } println() } - + sc.stop() } /** From c7ad80ae4256c88e380e7488d48cf6eb14a92d76 Mon Sep 17 00:00:00 2001 From: Daoyuan Wang Date: Tue, 10 Feb 2015 11:08:21 -0800 Subject: [PATCH 093/357] [SPARK-5716] [SQL] Support TOK_CHARSETLITERAL in HiveQl Author: Daoyuan Wang Closes #4502 from adrian-wang/utf8 and squashes the following commits: 4d7b0ee [Daoyuan Wang] remove useless import 606f981 [Daoyuan Wang] support TOK_CHARSETLITERAL in HiveQl --- .../spark/sql/hive/execution/HiveCompatibilitySuite.scala | 1 + .../src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 4 ++++ .../golden/inputddl5-0-ebbf2aec5f76af7225c2efaf870b8ba7 | 0 .../golden/inputddl5-1-2691407ccdc5c848a4ba2aecb6dbad75 | 0 .../golden/inputddl5-2-ca2faacf63dc4785f8bfd2ecc397e69b | 1 + .../golden/inputddl5-3-4f28c7412a05cff89c0bd86b65aa7ce | 1 + .../golden/inputddl5-4-bd7e25cff73f470d2e2336876342b783 | 1 + 7 files changed, 8 insertions(+) create mode 100644 sql/hive/src/test/resources/golden/inputddl5-0-ebbf2aec5f76af7225c2efaf870b8ba7 create mode 100644 sql/hive/src/test/resources/golden/inputddl5-1-2691407ccdc5c848a4ba2aecb6dbad75 create mode 100644 sql/hive/src/test/resources/golden/inputddl5-2-ca2faacf63dc4785f8bfd2ecc397e69b create mode 100644 sql/hive/src/test/resources/golden/inputddl5-3-4f28c7412a05cff89c0bd86b65aa7ce create mode 100644 sql/hive/src/test/resources/golden/inputddl5-4-bd7e25cff73f470d2e2336876342b783 diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index a6266f611c219..e443e5bd5f54d 
100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -518,6 +518,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "inputddl2", "inputddl3", "inputddl4", + "inputddl5", "inputddl6", "inputddl7", "inputddl8", diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index f51af62d3340b..969868aef2917 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive import java.sql.Date + import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.hive.conf.HiveConf @@ -1237,6 +1238,9 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case ast: ASTNode if ast.getType == HiveParser.TOK_DATELITERAL => Literal(Date.valueOf(ast.getText.substring(1, ast.getText.length - 1))) + case ast: ASTNode if ast.getType == HiveParser.TOK_CHARSETLITERAL => + Literal(BaseSemanticAnalyzer.charSetString(ast.getChild(0).getText, ast.getChild(1).getText)) + case a: ASTNode => throw new NotImplementedError( s"""No parse rules for ASTNode type: ${a.getType}, text: ${a.getText} : diff --git a/sql/hive/src/test/resources/golden/inputddl5-0-ebbf2aec5f76af7225c2efaf870b8ba7 b/sql/hive/src/test/resources/golden/inputddl5-0-ebbf2aec5f76af7225c2efaf870b8ba7 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/inputddl5-1-2691407ccdc5c848a4ba2aecb6dbad75 b/sql/hive/src/test/resources/golden/inputddl5-1-2691407ccdc5c848a4ba2aecb6dbad75 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/inputddl5-2-ca2faacf63dc4785f8bfd2ecc397e69b b/sql/hive/src/test/resources/golden/inputddl5-2-ca2faacf63dc4785f8bfd2ecc397e69b new file mode 100644 index 0000000000000..518a70918b2c7 --- /dev/null +++ b/sql/hive/src/test/resources/golden/inputddl5-2-ca2faacf63dc4785f8bfd2ecc397e69b @@ -0,0 +1 @@ +name string diff --git a/sql/hive/src/test/resources/golden/inputddl5-3-4f28c7412a05cff89c0bd86b65aa7ce b/sql/hive/src/test/resources/golden/inputddl5-3-4f28c7412a05cff89c0bd86b65aa7ce new file mode 100644 index 0000000000000..33398360345d7 --- /dev/null +++ b/sql/hive/src/test/resources/golden/inputddl5-3-4f28c7412a05cff89c0bd86b65aa7ce @@ -0,0 +1 @@ +邵铮 diff --git a/sql/hive/src/test/resources/golden/inputddl5-4-bd7e25cff73f470d2e2336876342b783 b/sql/hive/src/test/resources/golden/inputddl5-4-bd7e25cff73f470d2e2336876342b783 new file mode 100644 index 0000000000000..d00491fd7e5bb --- /dev/null +++ b/sql/hive/src/test/resources/golden/inputddl5-4-bd7e25cff73f470d2e2336876342b783 @@ -0,0 +1 @@ +1 From 69bc3bb6cffe82aee5ecd0b09410a847ba486b15 Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Tue, 10 Feb 2015 11:07:25 -0800 Subject: [PATCH 094/357] SPARK-4136. Under dynamic allocation, cancel outstanding executor requests when no longer needed This takes advantage of the changes made in SPARK-4337 to cancel pending requests to YARN when they are no longer needed. Each time the timer in `ExecutorAllocationManager` strikes, we compute `maxNumNeededExecutors`, the maximum number of executors we could fill with the current load. 
This is calculated as the total number of running and pending tasks divided by the number of cores per executor. If `maxNumNeededExecutors` is below the total number of running and pending executors, we call `requestTotalExecutors(maxNumNeededExecutors)` to let the cluster manager know that it should cancel any pending requests above this amount. If not, `maxNumNeededExecutors` is just used as a bound in alongside the configured `maxExecutors` to limit the number of new requests. The patch modifies the API exposed by `ExecutorAllocationClient` for requesting additional executors by moving from `requestExecutors` to `requestTotalExecutors`. This makes the communication between the `ExecutorAllocationManager` and the `YarnAllocator` easier to reason about and removes some state that needed to be kept in the `CoarseGrainedSchedulerBackend`. I think an argument can be made that this makes for a less attractive user-facing API in `SparkContext`, but I'm having trouble envisioning situations where a user would want to use either of these APIs. This will likely break some tests, but I wanted to get feedback on the approach before adding tests and polishing. Author: Sandy Ryza Closes #4168 from sryza/sandy-spark-4136 and squashes the following commits: 37ce77d [Sandy Ryza] Warn on negative number cd3b2ff [Sandy Ryza] SPARK-4136 --- .../spark/ExecutorAllocationClient.scala | 8 + .../spark/ExecutorAllocationManager.scala | 149 ++++++++++++------ .../scala/org/apache/spark/SparkContext.scala | 21 ++- .../CoarseGrainedSchedulerBackend.scala | 20 ++- .../ExecutorAllocationManagerSuite.scala | 36 ++++- 5 files changed, 184 insertions(+), 50 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala index a46a81eabd965..079055e00c6c3 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala @@ -19,9 +19,17 @@ package org.apache.spark /** * A client that communicates with the cluster manager to request or kill executors. + * This is currently supported only in YARN mode. */ private[spark] trait ExecutorAllocationClient { + /** + * Express a preference to the cluster manager for a given total number of executors. + * This can result in canceling pending requests or filing additional requests. + * Return whether the request is acknowledged by the cluster manager. + */ + private[spark] def requestTotalExecutors(numExecutors: Int): Boolean + /** * Request an additional number of executors from the cluster manager. * Return whether the request is acknowledged by the cluster manager. diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 02d54bf3b53cc..998695b6ac8ab 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -201,18 +201,34 @@ private[spark] class ExecutorAllocationManager( } /** - * If the add time has expired, request new executors and refresh the add time. - * If the remove time for an existing executor has expired, kill the executor. + * The number of executors we would have if the cluster manager were to fulfill all our existing + * requests. 
+ */ + private def targetNumExecutors(): Int = + numExecutorsPending + executorIds.size - executorsPendingToRemove.size + + /** + * The maximum number of executors we would need under the current load to satisfy all running + * and pending tasks, rounded up. + */ + private def maxNumExecutorsNeeded(): Int = { + val numRunningOrPendingTasks = listener.totalPendingTasks + listener.totalRunningTasks + (numRunningOrPendingTasks + tasksPerExecutor - 1) / tasksPerExecutor + } + + /** + * This is called at a fixed interval to regulate the number of pending executor requests + * and number of executors running. + * + * First, adjust our requested executors based on the add time and our current needs. + * Then, if the remove time for an existing executor has expired, kill the executor. + * * This is factored out into its own method for testing. */ private def schedule(): Unit = synchronized { val now = clock.getTimeMillis - if (addTime != NOT_SET && now >= addTime) { - addExecutors() - logDebug(s"Starting timer to add more executors (to " + - s"expire in $sustainedSchedulerBacklogTimeout seconds)") - addTime += sustainedSchedulerBacklogTimeout * 1000 - } + + addOrCancelExecutorRequests(now) removeTimes.retain { case (executorId, expireTime) => val expired = now >= expireTime @@ -223,59 +239,89 @@ private[spark] class ExecutorAllocationManager( } } + /** + * Check to see whether our existing allocation and the requests we've made previously exceed our + * current needs. If so, let the cluster manager know so that it can cancel pending requests that + * are unneeded. + * + * If not, and the add time has expired, see if we can request new executors and refresh the add + * time. + * + * @return the delta in the target number of executors. + */ + private def addOrCancelExecutorRequests(now: Long): Int = synchronized { + val currentTarget = targetNumExecutors + val maxNeeded = maxNumExecutorsNeeded + + if (maxNeeded < currentTarget) { + // The target number exceeds the number we actually need, so stop adding new + // executors and inform the cluster manager to cancel the extra pending requests. + val newTotalExecutors = math.max(maxNeeded, minNumExecutors) + client.requestTotalExecutors(newTotalExecutors) + numExecutorsToAdd = 1 + updateNumExecutorsPending(newTotalExecutors) + } else if (addTime != NOT_SET && now >= addTime) { + val delta = addExecutors(maxNeeded) + logDebug(s"Starting timer to add more executors (to " + + s"expire in $sustainedSchedulerBacklogTimeout seconds)") + addTime += sustainedSchedulerBacklogTimeout * 1000 + delta + } else { + 0 + } + } + /** * Request a number of executors from the cluster manager. * If the cap on the number of executors is reached, give up and reset the * number of executors to add next round instead of continuing to double it. - * Return the number actually requested. + * + * @param maxNumExecutorsNeeded the maximum number of executors all currently running or pending + * tasks could fill + * @return the number of additional executors actually requested. 
*/ - private def addExecutors(): Int = synchronized { - // Do not request more executors if we have already reached the upper bound - val numExistingExecutors = executorIds.size + numExecutorsPending - if (numExistingExecutors >= maxNumExecutors) { + private def addExecutors(maxNumExecutorsNeeded: Int): Int = { + // Do not request more executors if it would put our target over the upper bound + val currentTarget = targetNumExecutors + if (currentTarget >= maxNumExecutors) { logDebug(s"Not adding executors because there are already ${executorIds.size} " + s"registered and $numExecutorsPending pending executor(s) (limit $maxNumExecutors)") numExecutorsToAdd = 1 return 0 } - // The number of executors needed to satisfy all pending tasks is the number of tasks pending - // divided by the number of tasks each executor can fit, rounded up. - val maxNumExecutorsPending = - (listener.totalPendingTasks() + tasksPerExecutor - 1) / tasksPerExecutor - if (numExecutorsPending >= maxNumExecutorsPending) { - logDebug(s"Not adding executors because there are already $numExecutorsPending " + - s"pending and pending tasks could only fill $maxNumExecutorsPending") - numExecutorsToAdd = 1 - return 0 - } - - // It's never useful to request more executors than could satisfy all the pending tasks, so - // cap request at that amount. - // Also cap request with respect to the configured upper bound. - val maxNumExecutorsToAdd = math.min( - maxNumExecutorsPending - numExecutorsPending, - maxNumExecutors - numExistingExecutors) - assert(maxNumExecutorsToAdd > 0) - - val actualNumExecutorsToAdd = math.min(numExecutorsToAdd, maxNumExecutorsToAdd) - - val newTotalExecutors = numExistingExecutors + actualNumExecutorsToAdd - val addRequestAcknowledged = testing || client.requestExecutors(actualNumExecutorsToAdd) + val actualMaxNumExecutors = math.min(maxNumExecutors, maxNumExecutorsNeeded) + val newTotalExecutors = math.min(currentTarget + numExecutorsToAdd, actualMaxNumExecutors) + val addRequestAcknowledged = testing || client.requestTotalExecutors(newTotalExecutors) if (addRequestAcknowledged) { - logInfo(s"Requesting $actualNumExecutorsToAdd new executor(s) because " + - s"tasks are backlogged (new desired total will be $newTotalExecutors)") - numExecutorsToAdd = - if (actualNumExecutorsToAdd == numExecutorsToAdd) numExecutorsToAdd * 2 else 1 - numExecutorsPending += actualNumExecutorsToAdd - actualNumExecutorsToAdd + val delta = updateNumExecutorsPending(newTotalExecutors) + logInfo(s"Requesting $delta new executor(s) because tasks are backlogged" + + s" (new desired total will be $newTotalExecutors)") + numExecutorsToAdd = if (delta == numExecutorsToAdd) { + numExecutorsToAdd * 2 + } else { + 1 + } + delta } else { - logWarning(s"Unable to reach the cluster manager " + - s"to request $actualNumExecutorsToAdd executors!") + logWarning( + s"Unable to reach the cluster manager to request $newTotalExecutors total executors!") 0 } } + /** + * Given the new target number of executors, update the number of pending executor requests, + * and return the delta from the old number of pending requests. + */ + private def updateNumExecutorsPending(newTotalExecutors: Int): Int = { + val newNumExecutorsPending = + newTotalExecutors - executorIds.size + executorsPendingToRemove.size + val delta = newNumExecutorsPending - numExecutorsPending + numExecutorsPending = newNumExecutorsPending + delta + } + /** * Request the cluster manager to remove the given executor. * Return whether the request is received. 
@@ -415,6 +461,8 @@ private[spark] class ExecutorAllocationManager( private val stageIdToNumTasks = new mutable.HashMap[Int, Int] private val stageIdToTaskIndices = new mutable.HashMap[Int, mutable.HashSet[Int]] private val executorIdToTaskIds = new mutable.HashMap[String, mutable.HashSet[Long]] + // Number of tasks currently running on the cluster. Should be 0 when no stages are active. + private var numRunningTasks: Int = _ override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { val stageId = stageSubmitted.stageInfo.stageId @@ -435,6 +483,10 @@ private[spark] class ExecutorAllocationManager( // This is needed in case the stage is aborted for any reason if (stageIdToNumTasks.isEmpty) { allocationManager.onSchedulerQueueEmpty() + if (numRunningTasks != 0) { + logWarning("No stages are running, but numRunningTasks != 0") + numRunningTasks = 0 + } } } } @@ -446,6 +498,7 @@ private[spark] class ExecutorAllocationManager( val executorId = taskStart.taskInfo.executorId allocationManager.synchronized { + numRunningTasks += 1 // This guards against the race condition in which the `SparkListenerTaskStart` // event is posted before the `SparkListenerBlockManagerAdded` event, which is // possible because these events are posted in different threads. (see SPARK-4951) @@ -475,7 +528,8 @@ private[spark] class ExecutorAllocationManager( val executorId = taskEnd.taskInfo.executorId val taskId = taskEnd.taskInfo.taskId allocationManager.synchronized { - // If the executor is no longer running scheduled any tasks, mark it as idle + numRunningTasks -= 1 + // If the executor is no longer running any scheduled tasks, mark it as idle if (executorIdToTaskIds.contains(executorId)) { executorIdToTaskIds(executorId) -= taskId if (executorIdToTaskIds(executorId).isEmpty) { @@ -514,6 +568,11 @@ private[spark] class ExecutorAllocationManager( }.sum } + /** + * The number of tasks currently running across all stages. + */ + def totalRunningTasks(): Int = numRunningTasks + /** * Return true if an executor is not currently running a task, and false otherwise. * diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 8d3c3d000adf3..04ca5d1019e4b 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1103,10 +1103,27 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli listenerBus.addListener(listener) } + /** + * Express a preference to the cluster manager for a given total number of executors. + * This can result in canceling pending requests or filing additional requests. + * This is currently only supported in YARN mode. Return whether the request is received. + */ + private[spark] override def requestTotalExecutors(numExecutors: Int): Boolean = { + assert(master.contains("yarn") || dynamicAllocationTesting, + "Requesting executors is currently only supported in YARN mode") + schedulerBackend match { + case b: CoarseGrainedSchedulerBackend => + b.requestTotalExecutors(numExecutors) + case _ => + logWarning("Requesting executors is only supported in coarse-grained mode") + false + } + } + /** * :: DeveloperApi :: * Request an additional number of executors from the cluster manager. - * This is currently only supported in Yarn mode. Return whether the request is received. + * This is currently only supported in YARN mode. Return whether the request is received. 
*/ @DeveloperApi override def requestExecutors(numAdditionalExecutors: Int): Boolean = { @@ -1124,7 +1141,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli /** * :: DeveloperApi :: * Request that the cluster manager kill the specified executors. - * This is currently only supported in Yarn mode. Return whether the request is received. + * This is currently only supported in YARN mode. Return whether the request is received. */ @DeveloperApi override def killExecutors(executorIds: Seq[String]): Boolean = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index f9ca93432bf41..99986c32b0fde 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -311,7 +311,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste /** * Request an additional number of executors from the cluster manager. - * Return whether the request is acknowledged. + * @return whether the request is acknowledged. */ final override def requestExecutors(numAdditionalExecutors: Int): Boolean = synchronized { if (numAdditionalExecutors < 0) { @@ -327,6 +327,22 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste doRequestTotalExecutors(newTotal) } + /** + * Express a preference to the cluster manager for a given total number of executors. This can + * result in canceling pending requests or filing additional requests. + * @return whether the request is acknowledged. + */ + final override def requestTotalExecutors(numExecutors: Int): Boolean = synchronized { + if (numAdditionalExecutors < 0) { + throw new IllegalArgumentException( + "Attempted to request a negative number of executor(s) " + + s"$numExecutors from the cluster manager. Please specify a positive number!") + } + numPendingExecutors = + math.max(numExecutors - numExistingExecutors + executorsPendingToRemove.size, 0) + doRequestTotalExecutors(numExecutors) + } + /** * Request executors from the cluster manager by specifying the total number desired, * including existing pending and running executors. @@ -337,7 +353,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste * insufficient resources to satisfy the first request. We make the assumption here that the * cluster manager will eventually fulfill all requests when resources free up. * - * Return whether the request is acknowledged. + * @return whether the request is acknowledged. 
*/ protected def doRequestTotalExecutors(requestedTotal: Int): Boolean = false diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index 9eb87f016068d..5d96eabd34eee 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -175,6 +175,33 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { assert(numExecutorsPending(manager) === 9) } + test("cancel pending executors when no longer needed") { + sc = createSparkContext(1, 10) + val manager = sc.executorAllocationManager.get + sc.listenerBus.postToAll(SparkListenerStageSubmitted(createStageInfo(2, 5))) + + assert(numExecutorsPending(manager) === 0) + assert(numExecutorsToAdd(manager) === 1) + assert(addExecutors(manager) === 1) + assert(numExecutorsPending(manager) === 1) + assert(numExecutorsToAdd(manager) === 2) + assert(addExecutors(manager) === 2) + assert(numExecutorsPending(manager) === 3) + + val task1Info = createTaskInfo(0, 0, "executor-1") + sc.listenerBus.postToAll(SparkListenerTaskStart(2, 0, task1Info)) + + assert(numExecutorsToAdd(manager) === 4) + assert(addExecutors(manager) === 2) + + val task2Info = createTaskInfo(1, 0, "executor-1") + sc.listenerBus.postToAll(SparkListenerTaskStart(2, 0, task2Info)) + sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, null, task1Info, null)) + sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, null, task2Info, null)) + + assert(adjustRequestedExecutors(manager) === -1) + } + test("remove executors") { sc = createSparkContext(5, 10) val manager = sc.executorAllocationManager.get @@ -679,6 +706,7 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { private val _numExecutorsToAdd = PrivateMethod[Int]('numExecutorsToAdd) private val _numExecutorsPending = PrivateMethod[Int]('numExecutorsPending) + private val _maxNumExecutorsNeeded = PrivateMethod[Int]('maxNumExecutorsNeeded) private val _executorsPendingToRemove = PrivateMethod[collection.Set[String]]('executorsPendingToRemove) private val _executorIds = PrivateMethod[collection.Set[String]]('executorIds) @@ -686,6 +714,7 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { private val _removeTimes = PrivateMethod[collection.Map[String, Long]]('removeTimes) private val _schedule = PrivateMethod[Unit]('schedule) private val _addExecutors = PrivateMethod[Int]('addExecutors) + private val _addOrCancelExecutorRequests = PrivateMethod[Int]('addOrCancelExecutorRequests) private val _removeExecutor = PrivateMethod[Boolean]('removeExecutor) private val _onExecutorAdded = PrivateMethod[Unit]('onExecutorAdded) private val _onExecutorRemoved = PrivateMethod[Unit]('onExecutorRemoved) @@ -724,7 +753,12 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { } private def addExecutors(manager: ExecutorAllocationManager): Int = { - manager invokePrivate _addExecutors() + val maxNumExecutorsNeeded = manager invokePrivate _maxNumExecutorsNeeded() + manager invokePrivate _addExecutors(maxNumExecutorsNeeded) + } + + private def adjustRequestedExecutors(manager: ExecutorAllocationManager): Int = { + manager invokePrivate _addOrCancelExecutorRequests(0L) } private def removeExecutor(manager: ExecutorAllocationManager, id: String): Boolean = { From b640c841fca92bb0bca77267db2965ff8f79586f Mon Sep 17 00:00:00 2001 From: Andrew Or 
Date: Tue, 10 Feb 2015 11:18:01 -0800 Subject: [PATCH 095/357] [HOTFIX][SPARK-4136] Fix compilation and tests --- .../org/apache/spark/ExecutorAllocationClient.scala | 8 ++++---- .../cluster/CoarseGrainedSchedulerBackend.scala | 2 +- .../apache/spark/ExecutorAllocationManagerSuite.scala | 10 ++++------ 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala index 079055e00c6c3..443830f8d03b6 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala @@ -26,25 +26,25 @@ private[spark] trait ExecutorAllocationClient { /** * Express a preference to the cluster manager for a given total number of executors. * This can result in canceling pending requests or filing additional requests. - * Return whether the request is acknowledged by the cluster manager. + * @return whether the request is acknowledged by the cluster manager. */ private[spark] def requestTotalExecutors(numExecutors: Int): Boolean /** * Request an additional number of executors from the cluster manager. - * Return whether the request is acknowledged by the cluster manager. + * @return whether the request is acknowledged by the cluster manager. */ def requestExecutors(numAdditionalExecutors: Int): Boolean /** * Request that the cluster manager kill the specified executors. - * Return whether the request is acknowledged by the cluster manager. + * @return whether the request is acknowledged by the cluster manager. */ def killExecutors(executorIds: Seq[String]): Boolean /** * Request that the cluster manager kill the specified executor. - * Return whether the request is acknowledged by the cluster manager. + * @return whether the request is acknowledged by the cluster manager. */ def killExecutor(executorId: String): Boolean = killExecutors(Seq(executorId)) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 99986c32b0fde..6f77fa32ce37b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -333,7 +333,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val actorSyste * @return whether the request is acknowledged. */ final override def requestTotalExecutors(numExecutors: Int): Boolean = synchronized { - if (numAdditionalExecutors < 0) { + if (numExecutors < 0) { throw new IllegalArgumentException( "Attempted to request a negative number of executor(s) " + s"$numExecutors from the cluster manager. 
Please specify a positive number!") diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index 5d96eabd34eee..d3123e854016b 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -297,15 +297,15 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { assert(removeExecutor(manager, "5")) assert(removeExecutor(manager, "6")) assert(executorIds(manager).size === 10) - assert(addExecutors(manager) === 0) // still at upper limit + assert(addExecutors(manager) === 1) onExecutorRemoved(manager, "3") onExecutorRemoved(manager, "4") assert(executorIds(manager).size === 8) // Add succeeds again, now that we are no longer at the upper limit // Number of executors added restarts at 1 - assert(addExecutors(manager) === 1) - assert(addExecutors(manager) === 1) // upper limit reached again + assert(addExecutors(manager) === 2) + assert(addExecutors(manager) === 1) // upper limit reached assert(addExecutors(manager) === 0) assert(executorIds(manager).size === 8) onExecutorRemoved(manager, "5") @@ -313,9 +313,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { onExecutorAdded(manager, "13") onExecutorAdded(manager, "14") assert(executorIds(manager).size === 8) - assert(addExecutors(manager) === 1) - assert(addExecutors(manager) === 1) // upper limit reached again - assert(addExecutors(manager) === 0) + assert(addExecutors(manager) === 0) // still at upper limit onExecutorAdded(manager, "15") onExecutorAdded(manager, "16") assert(executorIds(manager).size === 10) From 59272dad77eb95c5ae8e0652e00d02a2675cda53 Mon Sep 17 00:00:00 2001 From: wangfei Date: Tue, 10 Feb 2015 11:54:30 -0800 Subject: [PATCH 096/357] [SPARK-5592][SQL] java.net.URISyntaxException when insert data to a partitioned table flowing sql get URISyntaxException: ``` create table sc as select * from (select '2011-01-11', '2011-01-11+14:18:26' from src tablesample (1 rows) union all select '2011-01-11', '2011-01-11+15:18:26' from src tablesample (1 rows) union all select '2011-01-11', '2011-01-11+16:18:26' from src tablesample (1 rows) ) s; create table sc_part (key string) partitioned by (ts string) stored as rcfile; set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nonstrict; insert overwrite table sc_part partition(ts) select * from sc; ``` java.net.URISyntaxException: Relative path in absolute URI: ts=2011-01-11+15:18:26 at org.apache.hadoop.fs.Path.initialize(Path.java:206) at org.apache.hadoop.fs.Path.(Path.java:172) at org.apache.hadoop.fs.Path.(Path.java:94) at org.apache.spark.sql.hive.SparkHiveDynamicPartitionWriterContainer.org$apache$spark$sql$hive$SparkHiveDynamicPartitionWriterContainer$$newWriter$1(hiveWriterContainers.scala:230) at org.apache.spark.sql.hive.SparkHiveDynamicPartitionWriterContainer$$anonfun$getLocalFileWriter$1.apply(hiveWriterContainers.scala:243) at org.apache.spark.sql.hive.SparkHiveDynamicPartitionWriterContainer$$anonfun$getLocalFileWriter$1.apply(hiveWriterContainers.scala:243) at scala.collection.mutable.MapLike$class.getOrElseUpdate(MapLike.scala:189) at scala.collection.mutable.AbstractMap.getOrElseUpdate(Map.scala:91) at org.apache.spark.sql.hive.SparkHiveDynamicPartitionWriterContainer.getLocalFileWriter(hiveWriterContainers.scala:243) at 
org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$org$apache$spark$sql$hive$execution$InsertIntoHiveTable$$writeToFile$1$1.apply(InsertIntoHiveTable.scala:113) at org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$org$apache$spark$sql$hive$execution$InsertIntoHiveTable$$writeToFile$1$1.apply(InsertIntoHiveTable.scala:105) at scala.collection.Iterator$class.foreach(Iterator.scala:727) at scala.collection.AbstractIterator.foreach(Iterator.scala:1157) at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.org$apache$spark$sql$hive$execution$InsertIntoHiveTable$$writeToFile$1(InsertIntoHiveTable.scala:105) at org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$saveAsHiveFile$3.apply(InsertIntoHiveTable.scala:87) at org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$saveAsHiveFile$3.apply(InsertIntoHiveTable.scala:87) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61) at org.apache.spark.scheduler.Task.run(Task.scala:64) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:194) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603) at java.lang.Thread.run(Thread.java:722) Caused by: java.net.URISyntaxException: Relative path in absolute URI: ts=2011-01-11+15:18:26 at java.net.URI.checkPath(URI.java:1804) at java.net.URI.(URI.java:752) at org.apache.hadoop.fs.Path.initialize(Path.java:203) Author: wangfei Author: Fei Wang Closes #4368 from scwf/SPARK-5592 and squashes the following commits: aa55ef4 [Fei Wang] comments addressed f8f8bb1 [wangfei] added test case f24624f [wangfei] Merge branch 'master' of https://github.com/apache/spark into SPARK-5592 9998177 [wangfei] added test case ea81daf [wangfei] fix URISyntaxException --- .../spark/sql/hive/hiveWriterContainers.scala | 12 +++++++++--- .../sql/hive/execution/HiveQuerySuite.scala | 16 ++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala index aae175e426ade..f136e43acc8f2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala @@ -30,6 +30,7 @@ import org.apache.hadoop.hive.ql.io.{HiveFileFormatUtils, HiveOutputFormat} import org.apache.hadoop.hive.ql.plan.{PlanUtils, TableDesc} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapred._ +import org.apache.hadoop.hive.common.FileUtils import org.apache.spark.mapred.SparkHadoopMapRedUtil import org.apache.spark.sql.Row @@ -212,9 +213,14 @@ private[spark] class SparkHiveDynamicPartitionWriterContainer( .zip(row.toSeq.takeRight(dynamicPartColNames.length)) .map { case (col, rawVal) => val string = if (rawVal == null) null else String.valueOf(rawVal) - s"/$col=${if (string == null || string.isEmpty) defaultPartName else string}" - } - .mkString + val colString = + if (string == null || string.isEmpty) { + defaultPartName + } else { + FileUtils.escapePathName(string) + } + s"/$col=$colString" + }.mkString def newWriter = { val newFileSinkDesc = new FileSinkDesc( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 27047ce4b1b0b..405b200d05412 100644 --- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -859,6 +859,22 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { } } + test("SPARK-5592: get java.net.URISyntaxException when dynamic partitioning") { + sql(""" + |create table sc as select * + |from (select '2011-01-11', '2011-01-11+14:18:26' from src tablesample (1 rows) + |union all + |select '2011-01-11', '2011-01-11+15:18:26' from src tablesample (1 rows) + |union all + |select '2011-01-11', '2011-01-11+16:18:26' from src tablesample (1 rows) ) s + """.stripMargin) + sql("create table sc_part (key string) partitioned by (ts string) stored as rcfile") + sql("set hive.exec.dynamic.partition=true") + sql("set hive.exec.dynamic.partition.mode=nonstrict") + sql("insert overwrite table sc_part partition(ts) select * from sc") + sql("drop table sc_part") + } + test("Partition spec validation") { sql("DROP TABLE IF EXISTS dp_test") sql("CREATE TABLE dp_test(key INT, value STRING) PARTITIONED BY (dp INT, sp INT)") From c49a4049845c91b225e70fd630cdf6ddc055faf8 Mon Sep 17 00:00:00 2001 From: Miguel Peralvo Date: Tue, 10 Feb 2015 19:54:52 +0000 Subject: [PATCH 097/357] [SPARK-5668] Display region in spark_ec2.py get_existing_cluster() Show the region for the different messages displayed by get_existing_cluster(): The search, found and error messages. Author: Miguel Peralvo Closes #4457 from MiguelPeralvo/patch-2 and squashes the following commits: a5514c8 [Miguel Peralvo] Update spark_ec2.py 0a837b0 [Miguel Peralvo] Update spark_ec2.py 3923f36 [Miguel Peralvo] Update spark_ec2.py 4ecd9f9 [Miguel Peralvo] [SPARK-5668] Display region in spark_ec2.py get_existing_cluster() --- ec2/spark_ec2.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index fe510f12bcec6..0ea7365d75b83 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -134,7 +134,7 @@ def parse_args(): help="Master instance type (leave empty for same as instance-type)") parser.add_option( "-r", "--region", default="us-east-1", - help="EC2 region zone to launch instances in") + help="EC2 region used to launch instances in, or to find them in") parser.add_option( "-z", "--zone", default="", help="Availability zone to launch instances in, or 'all' to spread " + @@ -614,7 +614,8 @@ def launch_cluster(conn, opts, cluster_name): # Get the EC2 instances in an existing cluster if available. # Returns a tuple of lists of EC2 instance objects for the masters and slaves def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): - print "Searching for existing cluster " + cluster_name + "..." + print "Searching for existing cluster " + cluster_name + " in region " \ + + opts.region + "..." 
reservations = conn.get_all_reservations() master_nodes = [] slave_nodes = [] @@ -632,9 +633,11 @@ def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): return (master_nodes, slave_nodes) else: if master_nodes == [] and slave_nodes != []: - print >> sys.stderr, "ERROR: Could not find master in group " + cluster_name + "-master" + print >> sys.stderr, "ERROR: Could not find master in group " + cluster_name \ + + "-master" + " in region " + opts.region else: - print >> sys.stderr, "ERROR: Could not find any existing cluster" + print >> sys.stderr, "ERROR: Could not find any existing cluster" \ + + " in region " + opts.region sys.exit(1) From de80b1ba4d3c4b1b3316d482d62e4668b996f6ac Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 10 Feb 2015 13:14:01 -0800 Subject: [PATCH 098/357] [SQL] Add toString to DataFrame/Column Author: Michael Armbrust Closes #4436 from marmbrus/dfToString and squashes the following commits: 8a3c35f [Michael Armbrust] Merge remote-tracking branch 'origin/master' into dfToString b72a81b [Michael Armbrust] add toString --- python/pyspark/sql/dataframe.py | 2 +- .../sql/catalyst/expressions/Expression.scala | 12 ++++++++ .../expressions/namedExpressions.scala | 20 +++++++++++++ .../org/apache/spark/sql/DataFrame.scala | 8 +++++ .../org/apache/spark/sql/DataFrameImpl.scala | 10 +++---- .../apache/spark/sql/IncomputableColumn.scala | 2 ++ .../spark/sql/execution/debug/package.scala | 11 ++++++- .../org/apache/spark/sql/DataFrameSuite.scala | 29 +++++++++++++++++++ 8 files changed, 86 insertions(+), 8 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index cda704eea75f5..04be65fe241c4 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -447,7 +447,7 @@ def selectExpr(self, *expr): `select` that accepts SQL expressions. >>> df.selectExpr("age * 2", "abs(age)").collect() - [Row(('age * 2)=4, Abs('age)=2), Row(('age * 2)=10, Abs('age)=5)] + [Row((age * 2)=4, Abs(age)=2), Row((age * 2)=10, Abs(age)=5)] """ jexpr = ListConverter().convert(expr, self._sc._gateway._gateway_client) jdf = self._jdf.selectExpr(self._sc._jvm.PythonUtils.toSeq(jexpr)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index cf14992ef835c..c32a4b886eb82 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.expressions +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.trees import org.apache.spark.sql.catalyst.trees.TreeNode @@ -66,6 +67,17 @@ abstract class Expression extends TreeNode[Expression] { */ def childrenResolved = !children.exists(!_.resolved) + /** + * Returns a string representation of this expression that does not have developer centric + * debugging information like the expression id. + */ + def prettyString: String = { + transform { + case a: AttributeReference => PrettyAttribute(a.name) + case u: UnresolvedAttribute => PrettyAttribute(u.name) + }.toString + } + /** * A set of helper functions that return the correct descendant of `scala.math.Numeric[T]` type * and do any casting necessary of child evaluation. 
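The `prettyString` helper added to `Expression` renders an expression for user-facing output by first rewriting `AttributeReference` and `UnresolvedAttribute` nodes into the `PrettyAttribute` placeholder introduced in the next hunk, so developer-centric details such as expression ids (`name#12`) are dropped. A minimal standalone sketch of the same pattern, using simplified hypothetical classes rather than the real Catalyst ones:

```scala
// Sketch only: a debug-oriented toString plus a separate user-facing rendering.
sealed trait Expr { def pretty: String }

// An attribute as the analyzer sees it: a name plus a unique expression id.
case class AttributeRef(name: String, exprId: Long) extends Expr {
  override def toString = s"$name#$exprId"   // developer/debug form
  def pretty: String = name                  // user-facing form, id stripped
}

case class Alias(child: Expr, name: String) extends Expr {
  override def toString = s"$child AS $name"
  def pretty: String = s"${child.pretty} AS $name"
}

object PrettyDemo extends App {
  val e = Alias(AttributeRef("age", 42L), "age_alias")
  println(e.toString)  // age#42 AS age_alias
  println(e.pretty)    // age AS age_alias
}
```

This split is what the `DataFrameImpl.select` change later in this patch relies on: unnamed expressions are aliased with `expr.prettyString` instead of `expr.toString`, which is why `selectExpr("age * 2")` now yields the column name `(age * 2)` rather than `('age * 2)`.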
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index e6ab1fd8d7939..7f122e9d55734 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -190,6 +190,26 @@ case class AttributeReference( override def toString: String = s"$name#${exprId.id}$typeSuffix" } +/** + * A place holder used when printing expressions without debugging information such as the + * expression id or the unresolved indicator. + */ +case class PrettyAttribute(name: String) extends Attribute with trees.LeafNode[Expression] { + type EvaluatedType = Any + + override def toString = name + + override def withNullability(newNullability: Boolean): Attribute = ??? + override def newInstance(): Attribute = ??? + override def withQualifiers(newQualifiers: Seq[String]): Attribute = ??? + override def withName(newName: String): Attribute = ??? + override def qualifiers: Seq[String] = ??? + override def exprId: ExprId = ??? + override def eval(input: Row): EvaluatedType = ??? + override def nullable: Boolean = ??? + override def dataType: DataType = ??? +} + object VirtualColumn { val groupingIdName = "grouping__id" def newGroupingId = AttributeReference(groupingIdName, IntegerType, false)() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 6abfb7853cf1c..04e0d09947492 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -27,6 +27,8 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils +import scala.util.control.NonFatal + private[sql] object DataFrame { def apply(sqlContext: SQLContext, logicalPlan: LogicalPlan): DataFrame = { @@ -92,6 +94,12 @@ trait DataFrame extends RDDApi[Row] { */ def toDataFrame: DataFrame = this + override def toString = + try schema.map(f => s"${f.name}: ${f.dataType.simpleString}").mkString("[", ", ", "]") catch { + case NonFatal(e) => + s"Invalid tree; ${e.getMessage}:\n$queryExecution" + } + /** * Returns a new [[DataFrame]] with columns renamed. This can be quite convenient in conversion * from a RDD of tuples into a [[DataFrame]] with meaningful names. 
For example: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 73393295ab0a5..1ee16ad5161c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -201,13 +201,11 @@ private[sql] class DataFrameImpl protected[sql]( override def as(alias: Symbol): DataFrame = Subquery(alias.name, logicalPlan) override def select(cols: Column*): DataFrame = { - val exprs = cols.zipWithIndex.map { - case (Column(expr: NamedExpression), _) => - expr - case (Column(expr: Expression), _) => - Alias(expr, expr.toString)() + val namedExpressions = cols.map { + case Column(expr: NamedExpression) => expr + case Column(expr: Expression) => Alias(expr, expr.prettyString)() } - Project(exprs.toSeq, logicalPlan) + Project(namedExpressions.toSeq, logicalPlan) } override def select(col: String, cols: String*): DataFrame = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala index 0600dcc226b4d..ce0557b88196f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala @@ -40,6 +40,8 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten throw new UnsupportedOperationException("Cannot run this method on an UncomputableColumn") } + override def toString = expr.prettyString + override def isComputable: Boolean = false override val sqlContext: SQLContext = null diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 5cc67cdd13944..acef49aabfe70 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -22,7 +22,7 @@ import scala.collection.mutable.HashSet import org.apache.spark.{AccumulatorParam, Accumulator, SparkContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext._ -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.{SQLConf, SQLContext, DataFrame, Row} import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.sql.types._ @@ -37,6 +37,15 @@ import org.apache.spark.sql.types._ */ package object debug { + /** + * Augments [[SQLContext]] with debug methods. + */ + implicit class DebugSQLContext(sqlContext: SQLContext) { + def debug() = { + sqlContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "false") + } + } + /** * :: DeveloperApi :: * Augments [[DataFrame]]s with debug methods. 
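The `DebugSQLContext` implicit above gives tests (and interactive sessions) a way to switch off eager DataFrame analysis so that an invalid plan can still be rendered instead of failing at construction time. In sketch form, reusing the `DataFrameSuite` fixtures (`TestSQLContext`, `testData`) and implicits exercised by the new test below:

```scala
// Defer analysis so an invalid plan can still be printed for debugging.
import org.apache.spark.sql.execution.debug._

TestSQLContext.debug()                      // sets SQLConf.DATAFRAME_EAGER_ANALYSIS to "false"
val badPlan = testData.select('badColumn)   // no longer fails eagerly here
println(badPlan.toString)                   // "Invalid tree; ..." followed by the query execution
```

The `toString` added to `DataFrame` catches the analysis failure (`NonFatal`) and falls back to printing the query execution, which is exactly what the new "invalid plan toString, debug mode" test asserts.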
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 5aa3db720c886..02623f73c7f76 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import org.apache.spark.sql.TestData._ + import scala.language.postfixOps import org.apache.spark.sql.Dsl._ @@ -53,6 +55,33 @@ class DataFrameSuite extends QueryTest { TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting.toString) } + test("dataframe toString") { + assert(testData.toString === "[key: int, value: string]") + assert(testData("key").toString === "[key: int]") + } + + test("incomputable toString") { + assert($"test".toString === "test") + } + + test("invalid plan toString, debug mode") { + val oldSetting = TestSQLContext.conf.dataFrameEagerAnalysis + TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, "true") + + // Turn on debug mode so we can see invalid query plans. + import org.apache.spark.sql.execution.debug._ + TestSQLContext.debug() + + val badPlan = testData.select('badColumn) + + assert(badPlan.toString contains badPlan.queryExecution.toString, + "toString on bad query plans should include the query execution but was:\n" + + badPlan.toString) + + // Set the flag back to original value before this test. + TestSQLContext.setConf(SQLConf.DATAFRAME_EAGER_ANALYSIS, oldSetting.toString) + } + test("table scan") { checkAnswer( testData, From f98707c043f1be9569ec774796edb783132773a8 Mon Sep 17 00:00:00 2001 From: OopsOutOfMemory Date: Tue, 10 Feb 2015 13:20:15 -0800 Subject: [PATCH 099/357] [SPARK-5686][SQL] Add show current roles command in HiveQl show current roles Author: OopsOutOfMemory Closes #4471 from OopsOutOfMemory/show_current_role and squashes the following commits: 1c6b210 [OopsOutOfMemory] add show current roles --- sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 969868aef2917..8618301ba84d6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -77,6 +77,7 @@ private[hive] object HiveQl { "TOK_REVOKE", "TOK_SHOW_GRANT", "TOK_SHOW_ROLE_GRANT", + "TOK_SHOW_SET_ROLE", "TOK_CREATEFUNCTION", "TOK_DROPFUNCTION", From fd2c032f95bbee342ca539df9e44927482981659 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 10 Feb 2015 14:05:55 -0800 Subject: [PATCH 100/357] [SPARK-5021] [MLlib] Gaussian Mixture now supports Sparse Input Following discussion in the Jira. 
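The core building block is a sparse variant of the symmetric rank-1 update `A := alpha * x * x^T + A` in `BLAS.syr`: only the nonzero entries of `x` contribute, so accumulating a covariance term from a sparse point costs on the order of `nnz(x)^2` updates instead of `n^2`. A plain-array sketch of that update (column-major storage for a square `n x n` matrix is assumed; this restates the idea rather than the MLlib types themselves):

```scala
// Sparse symmetric rank-1 update: a += alpha * x * x^T,
// where x is given in (indices, values) form and `a` holds an n x n
// matrix in column-major order. Only nnz^2 cells are touched.
def sparseSyr(alpha: Double,
              xIndices: Array[Int], xValues: Array[Double],
              a: Array[Double], n: Int): Unit = {
  val nnz = xValues.length
  var i = 0
  while (i < nnz) {
    val multiplier = alpha * xValues(i)
    val offset = xIndices(i) * n            // start of column xIndices(i)
    var j = 0
    while (j < nnz) {
      a(xIndices(j) + offset) += multiplier * xValues(j)
      j += 1
    }
    i += 1
  }
}
```

With that in place, `GaussianMixture` can keep the input as generic Breeze vectors (`BV[Double]`) instead of forcing `toDenseVector`, so dense and sparse points flow through the same E-step.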
Author: MechCoder Closes #4459 from MechCoder/sparse_gmm and squashes the following commits: 1b18dab [MechCoder] Rewrite syr for sparse matrices e579041 [MechCoder] Add test for covariance matrix 5cb370b [MechCoder] Separate tests for sparse data 5e096bd [MechCoder] Alphabetize and correct error message e180f4c [MechCoder] [SPARK-5021] Gaussian Mixture now supports Sparse Input --- .../mllib/clustering/GaussianMixture.scala | 31 ++++----- .../org/apache/spark/mllib/linalg/BLAS.scala | 36 +++++++++- .../distribution/MultivariateGaussian.scala | 10 +-- .../clustering/GaussianMixtureSuite.scala | 66 +++++++++++++++++-- .../apache/spark/mllib/linalg/BLASSuite.scala | 8 +++ 5 files changed, 125 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index 0be3014de862e..80584ef5e5979 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -19,10 +19,12 @@ package org.apache.spark.mllib.clustering import scala.collection.mutable.IndexedSeq -import breeze.linalg.{DenseMatrix => BreezeMatrix, DenseVector => BreezeVector, Transpose, diag} +import breeze.linalg.{diag, DenseMatrix => BreezeMatrix, DenseVector => BDV, SparseVector => BSV, + Transpose, Vector => BV} import org.apache.spark.annotation.Experimental -import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, Matrices, Vector, Vectors} +import org.apache.spark.mllib.linalg.{BLAS, DenseVector, DenseMatrix, Matrices, + SparseVector, Vector, Vectors} import org.apache.spark.mllib.stat.distribution.MultivariateGaussian import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD @@ -130,7 +132,7 @@ class GaussianMixture private ( val sc = data.sparkContext // we will operate on the data as breeze data - val breezeData = data.map(u => u.toBreeze.toDenseVector).cache() + val breezeData = data.map(_.toBreeze).cache() // Get length of the input vectors val d = breezeData.first().length @@ -148,7 +150,7 @@ class GaussianMixture private ( (Array.fill(k)(1.0 / k), Array.tabulate(k) { i => val slice = samples.view(i * nSamples, (i + 1) * nSamples) new MultivariateGaussian(vectorMean(slice), initCovariance(slice)) - }) + }) } } @@ -169,7 +171,7 @@ class GaussianMixture private ( var i = 0 while (i < k) { val mu = sums.means(i) / sums.weights(i) - BLAS.syr(-sums.weights(i), Vectors.fromBreeze(mu).asInstanceOf[DenseVector], + BLAS.syr(-sums.weights(i), Vectors.fromBreeze(mu), Matrices.fromBreeze(sums.sigmas(i)).asInstanceOf[DenseMatrix]) weights(i) = sums.weights(i) / sumWeights gaussians(i) = new MultivariateGaussian(mu, sums.sigmas(i) / sums.weights(i)) @@ -185,8 +187,8 @@ class GaussianMixture private ( } /** Average of dense breeze vectors */ - private def vectorMean(x: IndexedSeq[BreezeVector[Double]]): BreezeVector[Double] = { - val v = BreezeVector.zeros[Double](x(0).length) + private def vectorMean(x: IndexedSeq[BV[Double]]): BDV[Double] = { + val v = BDV.zeros[Double](x(0).length) x.foreach(xi => v += xi) v / x.length.toDouble } @@ -195,10 +197,10 @@ class GaussianMixture private ( * Construct matrix where diagonal entries are element-wise * variance of input vectors (computes biased variance) */ - private def initCovariance(x: IndexedSeq[BreezeVector[Double]]): BreezeMatrix[Double] = { + private def initCovariance(x: IndexedSeq[BV[Double]]): 
BreezeMatrix[Double] = { val mu = vectorMean(x) - val ss = BreezeVector.zeros[Double](x(0).length) - x.map(xi => (xi - mu) :^ 2.0).foreach(u => ss += u) + val ss = BDV.zeros[Double](x(0).length) + x.foreach(xi => ss += (xi - mu) :^ 2.0) diag(ss / x.length.toDouble) } } @@ -207,7 +209,7 @@ class GaussianMixture private ( private object ExpectationSum { def zero(k: Int, d: Int): ExpectationSum = { new ExpectationSum(0.0, Array.fill(k)(0.0), - Array.fill(k)(BreezeVector.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d,d))) + Array.fill(k)(BDV.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d,d))) } // compute cluster contributions for each input point @@ -215,19 +217,18 @@ private object ExpectationSum { def add( weights: Array[Double], dists: Array[MultivariateGaussian]) - (sums: ExpectationSum, x: BreezeVector[Double]): ExpectationSum = { + (sums: ExpectationSum, x: BV[Double]): ExpectationSum = { val p = weights.zip(dists).map { case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(x) } val pSum = p.sum sums.logLikelihood += math.log(pSum) - val xxt = x * new Transpose(x) var i = 0 while (i < sums.k) { p(i) /= pSum sums.weights(i) += p(i) sums.means(i) += x * p(i) - BLAS.syr(p(i), Vectors.fromBreeze(x).asInstanceOf[DenseVector], + BLAS.syr(p(i), Vectors.fromBreeze(x), Matrices.fromBreeze(sums.sigmas(i)).asInstanceOf[DenseMatrix]) i = i + 1 } @@ -239,7 +240,7 @@ private object ExpectationSum { private class ExpectationSum( var logLikelihood: Double, val weights: Array[Double], - val means: Array[BreezeVector[Double]], + val means: Array[BDV[Double]], val sigmas: Array[BreezeMatrix[Double]]) extends Serializable { val k = weights.length diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala index 079f7ca564a92..87052e1ba8539 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala @@ -235,12 +235,24 @@ private[spark] object BLAS extends Serializable with Logging { * @param x the vector x that contains the n elements. * @param A the symmetric matrix A. Size of n x n. */ - def syr(alpha: Double, x: DenseVector, A: DenseMatrix) { + def syr(alpha: Double, x: Vector, A: DenseMatrix) { val mA = A.numRows val nA = A.numCols - require(mA == nA, s"A is not a symmetric matrix. A: $mA x $nA") + require(mA == nA, s"A is not a square matrix (and hence is not symmetric). A: $mA x $nA") require(mA == x.size, s"The size of x doesn't match the rank of A. 
A: $mA x $nA, x: ${x.size}") + x match { + case dv: DenseVector => syr(alpha, dv, A) + case sv: SparseVector => syr(alpha, sv, A) + case _ => + throw new IllegalArgumentException(s"syr doesn't support vector type ${x.getClass}.") + } + } + + private def syr(alpha: Double, x: DenseVector, A: DenseMatrix) { + val nA = A.numRows + val mA = A.numCols + nativeBLAS.dsyr("U", x.size, alpha, x.values, 1, A.values, nA) // Fill lower triangular part of A @@ -255,6 +267,26 @@ private[spark] object BLAS extends Serializable with Logging { } } + private def syr(alpha: Double, x: SparseVector, A: DenseMatrix) { + val mA = A.numCols + val xIndices = x.indices + val xValues = x.values + val nnz = xValues.length + val Avalues = A.values + + var i = 0 + while (i < nnz) { + val multiplier = alpha * xValues(i) + val offset = xIndices(i) * mA + var j = 0 + while (j < nnz) { + Avalues(xIndices(j) + offset) += multiplier * xValues(j) + j += 1 + } + i += 1 + } + } + /** * C := alpha * A * B + beta * C * @param alpha a scalar to scale the multiplication A * B. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala index fd186b5ee6f72..cd6add9d60b0d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.stat.distribution -import breeze.linalg.{DenseVector => DBV, DenseMatrix => DBM, diag, max, eigSym} +import breeze.linalg.{DenseVector => DBV, DenseMatrix => DBM, diag, max, eigSym, Vector => BV} import org.apache.spark.annotation.DeveloperApi; import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix} @@ -62,21 +62,21 @@ class MultivariateGaussian ( /** Returns density of this multivariate Gaussian at given point, x */ def pdf(x: Vector): Double = { - pdf(x.toBreeze.toDenseVector) + pdf(x.toBreeze) } /** Returns the log-density of this multivariate Gaussian at given point, x */ def logpdf(x: Vector): Double = { - logpdf(x.toBreeze.toDenseVector) + logpdf(x.toBreeze) } /** Returns density of this multivariate Gaussian at given point, x */ - private[mllib] def pdf(x: DBV[Double]): Double = { + private[mllib] def pdf(x: BV[Double]): Double = { math.exp(logpdf(x)) } /** Returns the log-density of this multivariate Gaussian at given point, x */ - private[mllib] def logpdf(x: DBV[Double]): Double = { + private[mllib] def logpdf(x: BV[Double]): Double = { val delta = x - breezeMu val v = rootSigmaInv * delta u + v.t * v * -0.5 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala index c2cd56ea40adc..1b46a4012d731 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala @@ -31,7 +31,7 @@ class GaussianMixtureSuite extends FunSuite with MLlibTestSparkContext { Vectors.dense(5.0, 10.0), Vectors.dense(4.0, 11.0) )) - + // expectations val Ew = 1.0 val Emu = Vectors.dense(5.0, 10.0) @@ -44,6 +44,7 @@ class GaussianMixtureSuite extends FunSuite with MLlibTestSparkContext { assert(gmm.gaussians(0).mu ~== Emu absTol 1E-5) assert(gmm.gaussians(0).sigma ~== Esigma absTol 1E-5) } + } test("two clusters") { @@ -54,7 +55,7 @@ class 
GaussianMixtureSuite extends FunSuite with MLlibTestSparkContext { Vectors.dense( 5.7048), Vectors.dense( 4.6567), Vectors.dense( 5.5026), Vectors.dense( 4.5605), Vectors.dense( 5.2043), Vectors.dense( 6.2734) )) - + // we set an initial gaussian to induce expected results val initialGmm = new GaussianMixtureModel( Array(0.5, 0.5), @@ -63,7 +64,7 @@ class GaussianMixtureSuite extends FunSuite with MLlibTestSparkContext { new MultivariateGaussian(Vectors.dense(1.0), Matrices.dense(1, 1, Array(1.0))) ) ) - + val Ew = Array(1.0 / 3.0, 2.0 / 3.0) val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604)) val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644))) @@ -72,7 +73,7 @@ class GaussianMixtureSuite extends FunSuite with MLlibTestSparkContext { .setK(2) .setInitialModel(initialGmm) .run(data) - + assert(gmm.weights(0) ~== Ew(0) absTol 1E-3) assert(gmm.weights(1) ~== Ew(1) absTol 1E-3) assert(gmm.gaussians(0).mu ~== Emu(0) absTol 1E-3) @@ -80,4 +81,61 @@ class GaussianMixtureSuite extends FunSuite with MLlibTestSparkContext { assert(gmm.gaussians(0).sigma ~== Esigma(0) absTol 1E-3) assert(gmm.gaussians(1).sigma ~== Esigma(1) absTol 1E-3) } + + test("single cluster with sparse data") { + val data = sc.parallelize(Array( + Vectors.sparse(3, Array(0, 2), Array(4.0, 2.0)), + Vectors.sparse(3, Array(0, 2), Array(2.0, 4.0)), + Vectors.sparse(3, Array(1), Array(6.0)) + )) + + val Ew = 1.0 + val Emu = Vectors.dense(2.0, 2.0, 2.0) + val Esigma = Matrices.dense(3, 3, + Array(8.0 / 3.0, -4.0, 4.0 / 3.0, -4.0, 8.0, -4.0, 4.0 / 3.0, -4.0, 8.0 / 3.0) + ) + + val seeds = Array(42, 1994, 27, 11, 0) + seeds.foreach { seed => + val gmm = new GaussianMixture().setK(1).setSeed(seed).run(data) + assert(gmm.weights(0) ~== Ew absTol 1E-5) + assert(gmm.gaussians(0).mu ~== Emu absTol 1E-5) + assert(gmm.gaussians(0).sigma ~== Esigma absTol 1E-5) + } + } + + test("two clusters with sparse data") { + val data = sc.parallelize(Array( + Vectors.dense(-5.1971), Vectors.dense(-2.5359), Vectors.dense(-3.8220), + Vectors.dense(-5.2211), Vectors.dense(-5.0602), Vectors.dense( 4.7118), + Vectors.dense( 6.8989), Vectors.dense( 3.4592), Vectors.dense( 4.6322), + Vectors.dense( 5.7048), Vectors.dense( 4.6567), Vectors.dense( 5.5026), + Vectors.dense( 4.5605), Vectors.dense( 5.2043), Vectors.dense( 6.2734) + )) + + val sparseData = data.map(point => Vectors.sparse(1, Array(0), point.toArray)) + // we set an initial gaussian to induce expected results + val initialGmm = new GaussianMixtureModel( + Array(0.5, 0.5), + Array( + new MultivariateGaussian(Vectors.dense(-1.0), Matrices.dense(1, 1, Array(1.0))), + new MultivariateGaussian(Vectors.dense(1.0), Matrices.dense(1, 1, Array(1.0))) + ) + ) + val Ew = Array(1.0 / 3.0, 2.0 / 3.0) + val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604)) + val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644))) + + val sparseGMM = new GaussianMixture() + .setK(2) + .setInitialModel(initialGmm) + .run(data) + + assert(sparseGMM.weights(0) ~== Ew(0) absTol 1E-3) + assert(sparseGMM.weights(1) ~== Ew(1) absTol 1E-3) + assert(sparseGMM.gaussians(0).mu ~== Emu(0) absTol 1E-3) + assert(sparseGMM.gaussians(1).mu ~== Emu(1) absTol 1E-3) + assert(sparseGMM.gaussians(0).sigma ~== Esigma(0) absTol 1E-3) + assert(sparseGMM.gaussians(1).sigma ~== Esigma(1) absTol 1E-3) + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala index 
b0b78acd6df16..002cb253862b5 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/BLASSuite.scala @@ -166,6 +166,14 @@ class BLASSuite extends FunSuite { syr(alpha, y, dA) } } + + val xSparse = new SparseVector(4, Array(0, 2, 3), Array(1.0, 3.0, 4.0)) + val dD = new DenseMatrix(4, 4, + Array(0.0, 1.2, 2.2, 3.1, 1.2, 3.2, 5.3, 4.6, 2.2, 5.3, 1.8, 3.0, 3.1, 4.6, 3.0, 0.8)) + syr(0.1, xSparse, dD) + val expectedSparse = new DenseMatrix(4, 4, + Array(0.1, 1.2, 2.5, 3.5, 1.2, 3.2, 5.3, 4.6, 2.5, 5.3, 2.7, 4.2, 3.5, 4.6, 4.2, 2.4)) + assert(dD ~== expectedSparse absTol 1e-15) } test("gemm") { From 5820961289eb98e45eb467efa316c7592b8d619c Mon Sep 17 00:00:00 2001 From: Brennon York Date: Tue, 10 Feb 2015 14:57:00 -0800 Subject: [PATCH 101/357] [SPARK-5343][GraphX]: ShortestPaths traverses backwards Corrected the logic with ShortestPaths so that the calculation will run forward rather than backwards. Output before looked like: ```scala import org.apache.spark.graphx._ val g = Graph(sc.makeRDD(Array((1L,""), (2L,""), (3L,""))), sc.makeRDD(Array(Edge(1L,2L,""), Edge(2L,3L,"")))) lib.ShortestPaths.run(g,Array(3)).vertices.collect // res0: Array[(org.apache.spark.graphx.VertexId, org.apache.spark.graphx.lib.ShortestPaths.SPMap)] = Array((1,Map()), (3,Map(3 -> 0)), (2,Map())) lib.ShortestPaths.run(g,Array(1)).vertices.collect // res1: Array[(org.apache.spark.graphx.VertexId, org.apache.spark.graphx.lib.ShortestPaths.SPMap)] = Array((1,Map(1 -> 0)), (3,Map(1 -> 2)), (2,Map(1 -> 1))) ``` And new output after the changes looks like: ```scala import org.apache.spark.graphx._ val g = Graph(sc.makeRDD(Array((1L,""), (2L,""), (3L,""))), sc.makeRDD(Array(Edge(1L,2L,""), Edge(2L,3L,"")))) lib.ShortestPaths.run(g,Array(3)).vertices.collect // res0: Array[(org.apache.spark.graphx.VertexId, org.apache.spark.graphx.lib.ShortestPaths.SPMap)] = Array((1,Map(3 -> 2)), (2,Map(3 -> 1)), (3,Map(3 -> 0))) lib.ShortestPaths.run(g,Array(1)).vertices.collect // res1: Array[(org.apache.spark.graphx.VertexId, org.apache.spark.graphx.lib.ShortestPaths.SPMap)] = Array((1,Map(1 -> 0)), (2,Map()), (3,Map())) ``` Author: Brennon York Closes #4478 from brennonyork/SPARK-5343 and squashes the following commits: aa57f83 [Brennon York] updated to set ShortestPaths to run 'forward' rather than 'backward' --- .../scala/org/apache/spark/graphx/lib/ShortestPaths.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/ShortestPaths.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/ShortestPaths.scala index 590f0474957dd..179f2843818e0 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/ShortestPaths.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/ShortestPaths.scala @@ -61,8 +61,8 @@ object ShortestPaths { } def sendMessage(edge: EdgeTriplet[SPMap, _]): Iterator[(VertexId, SPMap)] = { - val newAttr = incrementMap(edge.srcAttr) - if (edge.dstAttr != addMaps(newAttr, edge.dstAttr)) Iterator((edge.dstId, newAttr)) + val newAttr = incrementMap(edge.dstAttr) + if (edge.srcAttr != addMaps(newAttr, edge.srcAttr)) Iterator((edge.srcId, newAttr)) else Iterator.empty } From 52983d7f4f1a155433b6df3687cf5dc71804cfd5 Mon Sep 17 00:00:00 2001 From: Sephiroth-Lin Date: Tue, 10 Feb 2015 23:23:35 +0000 Subject: [PATCH 102/357] [SPARK-5644] [Core]Delete tmp dir when sc is stop When we run driver as a service, and for each time we run job we only call sc.stop, then will not 
delete tmp dir create by HttpFileServer and SparkEnv, it will be deleted until the service process exit, so we need to delete these tmp dirs when sc is stop directly. Author: Sephiroth-Lin Closes #4412 from Sephiroth-Lin/bug-fix-master-01 and squashes the following commits: fbbc785 [Sephiroth-Lin] using an interpolated string b968e14 [Sephiroth-Lin] using an interpolated string 4edf394 [Sephiroth-Lin] rename the variable and update comment 1339c96 [Sephiroth-Lin] add a member to store the reference of tmp dir b2018a5 [Sephiroth-Lin] check sparkFilesDir before delete f48a3c6 [Sephiroth-Lin] don't check sparkFilesDir, check executorId dd9686e [Sephiroth-Lin] format code b38e0f0 [Sephiroth-Lin] add dir check before delete d7ccc64 [Sephiroth-Lin] Change log level 1d70926 [Sephiroth-Lin] update comment e2a2b1b [Sephiroth-Lin] update comment aeac518 [Sephiroth-Lin] Delete tmp dir when sc is stop c0d5b28 [Sephiroth-Lin] Delete tmp dir when sc is stop --- .../org/apache/spark/HttpFileServer.scala | 9 ++++++ .../scala/org/apache/spark/SparkEnv.scala | 29 ++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/HttpFileServer.scala b/core/src/main/scala/org/apache/spark/HttpFileServer.scala index 3f33332a81eaf..7e706bcc42f04 100644 --- a/core/src/main/scala/org/apache/spark/HttpFileServer.scala +++ b/core/src/main/scala/org/apache/spark/HttpFileServer.scala @@ -50,6 +50,15 @@ private[spark] class HttpFileServer( def stop() { httpServer.stop() + + // If we only stop sc, but the driver process still run as a services then we need to delete + // the tmp dir, if not, it will create too many tmp dirs + try { + Utils.deleteRecursively(baseDir) + } catch { + case e: Exception => + logWarning(s"Exception while deleting Spark temp dir: ${baseDir.getAbsolutePath}", e) + } } def addFile(file: File) : String = { diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index f25db7f8de565..b63bea5b102b6 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -76,6 +76,8 @@ class SparkEnv ( // (e.g., HadoopFileRDD uses this to cache JobConfs and InputFormats). private[spark] val hadoopJobMetadata = new MapMaker().softValues().makeMap[String, Any]() + private var driverTmpDirToDelete: Option[String] = None + private[spark] def stop() { isStopped = true pythonWorkers.foreach { case(key, worker) => worker.stop() } @@ -93,6 +95,22 @@ class SparkEnv ( // actorSystem.awaitTermination() // Note that blockTransferService is stopped by BlockManager since it is started by it. + + // If we only stop sc, but the driver process still run as a services then we need to delete + // the tmp dir, if not, it will create too many tmp dirs. + // We only need to delete the tmp dir create by driver, because sparkFilesDir is point to the + // current working dir in executor which we do not need to delete. 
+ driverTmpDirToDelete match { + case Some(path) => { + try { + Utils.deleteRecursively(new File(path)) + } catch { + case e: Exception => + logWarning(s"Exception while deleting Spark temp dir: $path", e) + } + } + case None => // We just need to delete tmp dir created by driver, so do nothing on executor + } } private[spark] @@ -350,7 +368,7 @@ object SparkEnv extends Logging { "levels using the RDD.persist() method instead.") } - new SparkEnv( + val envInstance = new SparkEnv( executorId, actorSystem, serializer, @@ -367,6 +385,15 @@ object SparkEnv extends Logging { metricsSystem, shuffleMemoryManager, conf) + + // Add a reference to tmp dir created by driver, we will delete this tmp dir when stop() is + // called, and we only need to do it for driver. Because driver may run as a service, and if we + // don't delete this tmp dir when sc is stopped, then will create too many tmp dirs. + if (isDriver) { + envInstance.driverTmpDirToDelete = Some(sparkFilesDir) + } + + envInstance } /** From 91e3512544d9ab684799ac9a9c341ab465e1b427 Mon Sep 17 00:00:00 2001 From: "Sheng, Li" Date: Wed, 11 Feb 2015 00:59:46 +0000 Subject: [PATCH 103/357] [SQL][Minor] correct some comments Author: Sheng, Li Author: OopsOutOfMemory Closes #4508 from OopsOutOfMemory/cmt and squashes the following commits: d8a68c6 [Sheng, Li] Update ddl.scala f24aeaf [OopsOutOfMemory] correct style --- sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index a692ef51b31ed..bf2ad14763e9f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -141,7 +141,7 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { /* * describe [extended] table avroTable - * This will display all columns of table `avroTable` includes column_name,column_type,nullable + * This will display all columns of table `avroTable` includes column_name,column_type,comment */ protected lazy val describeTable: Parser[LogicalPlan] = (DESCRIBE ~> opt(EXTENDED)) ~ (ident <~ ".").? 
~ ident ^^ { From 2d50a010ff57a861b13c2088ac048662d535f5e7 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 10 Feb 2015 17:02:44 -0800 Subject: [PATCH 104/357] [SPARK-5725] [SQL] Fixes ParquetRelation2.equals [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4513) Author: Cheng Lian Closes #4513 from liancheng/spark-5725 and squashes the following commits: bf6a087 [Cheng Lian] Fixes ParquetRelation2.equals --- .../src/main/scala/org/apache/spark/sql/parquet/newParquet.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 49d46334b6525..04804f78f5c34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -152,6 +152,7 @@ case class ParquetRelation2 paths.toSet == relation.paths.toSet && maybeMetastoreSchema == relation.maybeMetastoreSchema && (shouldMergeSchemas == relation.shouldMergeSchemas || schema == relation.schema) + case _ => false } private[sql] def sparkContext = sqlContext.sparkContext From e28b6bdbb5c5e4fd62ec0b547b77719c3f7e476e Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 10 Feb 2015 17:06:12 -0800 Subject: [PATCH 105/357] [SQL] Make Options in the data source API CREATE TABLE statements optional. Users will not need to put `Options()` in a CREATE TABLE statement when there is not option provided. Author: Yin Huai Closes #4515 from yhuai/makeOptionsOptional and squashes the following commits: 1a898d3 [Yin Huai] Make options optional. --- .../src/main/scala/org/apache/spark/sql/sources/ddl.scala | 7 ++++--- .../apache/spark/sql/hive/MetastoreDataSourcesSuite.scala | 4 +--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index bf2ad14763e9f..9f64f761002c9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -106,13 +106,14 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { protected lazy val createTable: Parser[LogicalPlan] = ( (CREATE ~> TEMPORARY.? <~ TABLE) ~ (IF ~> NOT <~ EXISTS).? ~ ident - ~ (tableCols).? ~ (USING ~> className) ~ (OPTIONS ~> options) ~ (AS ~> restInput).? ^^ { + ~ (tableCols).? ~ (USING ~> className) ~ (OPTIONS ~> options).? ~ (AS ~> restInput).? 
^^ { case temp ~ allowExisting ~ tableName ~ columns ~ provider ~ opts ~ query => if (temp.isDefined && allowExisting.isDefined) { throw new DDLException( "a CREATE TEMPORARY TABLE statement does not allow IF NOT EXISTS clause.") } + val options = opts.getOrElse(Map.empty[String, String]) if (query.isDefined) { if (columns.isDefined) { throw new DDLException( @@ -121,7 +122,7 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { CreateTableUsingAsSelect(tableName, provider, temp.isDefined, - opts, + options, allowExisting.isDefined, query.get) } else { @@ -131,7 +132,7 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { userSpecifiedSchema, provider, temp.isDefined, - opts, + options, allowExisting.isDefined) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 036efa84d7c85..9ce058909f429 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -361,9 +361,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { s""" |CREATE TABLE ctasJsonTable |USING org.apache.spark.sql.json.DefaultSource - |OPTIONS ( - | - |) AS + |AS |SELECT * FROM jsonTable """.stripMargin) From ed167e70c6d355f39b366ea0d3b92dd26d826a0b Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 10 Feb 2015 17:19:10 -0800 Subject: [PATCH 106/357] [SPARK-5493] [core] Add option to impersonate user. Hadoop has a feature that allows users to impersonate other users when submitting applications or talking to HDFS, for example. These impersonated users are referred generally as "proxy users". Services such as Oozie or Hive use this feature to run applications as the requesting user. This change makes SparkSubmit accept a new command line option to run the application as a proxy user. It also fixes the plumbing of the user name through the UI (and a couple of other places) to refer to the correct user running the application, which can be different than `sys.props("user.name")` even without proxies (e.g. when using kerberos). Author: Marcelo Vanzin Closes #4405 from vanzin/SPARK-5493 and squashes the following commits: df82427 [Marcelo Vanzin] Clarify the reason for the special exception handling. 05bfc08 [Marcelo Vanzin] Remove unneeded annotation. 4840de9 [Marcelo Vanzin] Review feedback. 8af06ff [Marcelo Vanzin] Fix usage string. 2e4fa8f [Marcelo Vanzin] Merge branch 'master' into SPARK-5493 b6c947d [Marcelo Vanzin] Merge branch 'master' into SPARK-5493 0540d38 [Marcelo Vanzin] [SPARK-5493] [core] Add option to impersonate user. 
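For reference, the impersonation itself follows the standard Hadoop pattern that this patch wires into `SparkSubmit`: build a proxy `UserGroupInformation` on top of the real (possibly Kerberos-authenticated) user and run the application's main method inside `doAs`. A trimmed sketch of that pattern (error handling and the REST/legacy gateway fallback are omitted; the cluster side must also authorize the real user as a proxy via Hadoop's `hadoop.proxyuser.*` settings for the call to succeed):

```scala
import java.security.PrivilegedExceptionAction
import org.apache.hadoop.security.UserGroupInformation

// Run `body` as `proxyUserName`, using the current (real) user's credentials.
def runAs(proxyUserName: String)(body: => Unit): Unit = {
  val realUser  = UserGroupInformation.getCurrentUser()
  val proxyUser = UserGroupInformation.createProxyUser(proxyUserName, realUser)
  proxyUser.doAs(new PrivilegedExceptionAction[Unit] {
    override def run(): Unit = body
  })
}

// e.g. from SparkSubmit.submit, when --proxy-user is given:
//   runAs(args.proxyUser) {
//     runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
//   }
```

The related `Utils.getCurrentUserName()` cleanup means the UI ACLs and `SPARK_USER` now reflect the effective (proxy or Kerberos) user rather than `sys.props("user.name")`.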
--- bin/utils.sh | 3 +- bin/windows-utils.cmd | 1 + .../org/apache/spark/SecurityManager.scala | 3 +- .../scala/org/apache/spark/SparkContext.scala | 16 ++---- .../apache/spark/deploy/SparkHadoopUtil.scala | 19 +++---- .../org/apache/spark/deploy/SparkSubmit.scala | 56 ++++++++++++++++--- .../spark/deploy/SparkSubmitArguments.scala | 7 +++ .../scala/org/apache/spark/util/Utils.scala | 11 ++++ 8 files changed, 82 insertions(+), 34 deletions(-) diff --git a/bin/utils.sh b/bin/utils.sh index 2241200082018..748dbe345a74c 100755 --- a/bin/utils.sh +++ b/bin/utils.sh @@ -35,7 +35,8 @@ function gatherSparkSubmitOpts() { --master | --deploy-mode | --class | --name | --jars | --packages | --py-files | --files | \ --conf | --repositories | --properties-file | --driver-memory | --driver-java-options | \ --driver-library-path | --driver-class-path | --executor-memory | --driver-cores | \ - --total-executor-cores | --executor-cores | --queue | --num-executors | --archives) + --total-executor-cores | --executor-cores | --queue | --num-executors | --archives | \ + --proxy-user) if [[ $# -lt 2 ]]; then "$SUBMIT_USAGE_FUNCTION" exit 1; diff --git a/bin/windows-utils.cmd b/bin/windows-utils.cmd index 567b8733f7f77..0cf9e87ca554b 100644 --- a/bin/windows-utils.cmd +++ b/bin/windows-utils.cmd @@ -33,6 +33,7 @@ SET opts="%opts:~1,-1% \<--conf\> \<--properties-file\> \<--driver-memory\> \<-- SET opts="%opts:~1,-1% \<--driver-library-path\> \<--driver-class-path\> \<--executor-memory\>" SET opts="%opts:~1,-1% \<--driver-cores\> \<--total-executor-cores\> \<--executor-cores\> \<--queue\>" SET opts="%opts:~1,-1% \<--num-executors\> \<--archives\> \<--packages\> \<--repositories\>" +SET opts="%opts:~1,-1% \<--proxy-user\>" echo %1 | findstr %opts% >nul if %ERRORLEVEL% equ 0 ( diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala index 88d35a4bacc6e..3653f724ba192 100644 --- a/core/src/main/scala/org/apache/spark/SecurityManager.scala +++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala @@ -27,6 +27,7 @@ import org.apache.hadoop.io.Text import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.network.sasl.SecretKeyHolder +import org.apache.spark.util.Utils /** * Spark class responsible for security. @@ -203,7 +204,7 @@ private[spark] class SecurityManager(sparkConf: SparkConf) // always add the current user and SPARK_USER to the viewAcls private val defaultAclUsers = Set[String](System.getProperty("user.name", ""), - Option(System.getenv("SPARK_USER")).getOrElse("")).filter(!_.isEmpty) + Utils.getCurrentUserName()) setViewAcls(defaultAclUsers, sparkConf.get("spark.ui.view.acls", "")) setModifyAcls(defaultAclUsers, sparkConf.get("spark.modify.acls", "")) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 04ca5d1019e4b..53fce6b0defdf 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -191,7 +191,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // log out Spark Version in Spark driver log logInfo(s"Running Spark version $SPARK_VERSION") - + private[spark] val conf = config.clone() conf.validateSettings() @@ -335,11 +335,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli executorEnvs ++= conf.getExecutorEnv // Set SPARK_USER for user who is running SparkContext. 
- val sparkUser = Option { - Option(System.getenv("SPARK_USER")).getOrElse(System.getProperty("user.name")) - }.getOrElse { - SparkContext.SPARK_UNKNOWN_USER - } + val sparkUser = Utils.getCurrentUserName() executorEnvs("SPARK_USER") = sparkUser // Create and start the scheduler @@ -826,7 +822,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli vClass: Class[V], conf: Configuration = hadoopConfiguration): RDD[(K, V)] = { assertNotStopped() - // The call to new NewHadoopJob automatically adds security credentials to conf, + // The call to new NewHadoopJob automatically adds security credentials to conf, // so we don't need to explicitly add them ourselves val job = new NewHadoopJob(conf) NewFileInputFormat.addInputPath(job, new Path(path)) @@ -1626,8 +1622,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli @deprecated("use defaultMinPartitions", "1.0.0") def defaultMinSplits: Int = math.min(defaultParallelism, 2) - /** - * Default min number of partitions for Hadoop RDDs when not given by user + /** + * Default min number of partitions for Hadoop RDDs when not given by user * Notice that we use math.min so the "defaultMinPartitions" cannot be higher than 2. * The reasons for this are discussed in https://github.com/mesos/spark/pull/718 */ @@ -1844,8 +1840,6 @@ object SparkContext extends Logging { private[spark] val SPARK_JOB_INTERRUPT_ON_CANCEL = "spark.job.interruptOnCancel" - private[spark] val SPARK_UNKNOWN_USER = "" - private[spark] val DRIVER_IDENTIFIER = "" // The following deprecated objects have already been copied to `object AccumulatorParam` to diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 03238e9fa0088..e0a32fb65cd51 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -52,18 +52,13 @@ class SparkHadoopUtil extends Logging { * do a FileSystem.closeAllForUGI in order to avoid leaking Filesystems */ def runAsSparkUser(func: () => Unit) { - val user = Option(System.getenv("SPARK_USER")).getOrElse(SparkContext.SPARK_UNKNOWN_USER) - if (user != SparkContext.SPARK_UNKNOWN_USER) { - logDebug("running as user: " + user) - val ugi = UserGroupInformation.createRemoteUser(user) - transferCredentials(UserGroupInformation.getCurrentUser(), ugi) - ugi.doAs(new PrivilegedExceptionAction[Unit] { - def run: Unit = func() - }) - } else { - logDebug("running as SPARK_UNKNOWN_USER") - func() - } + val user = Utils.getCurrentUserName() + logDebug("running as user: " + user) + val ugi = UserGroupInformation.createRemoteUser(user) + transferCredentials(UserGroupInformation.getCurrentUser(), ugi) + ugi.doAs(new PrivilegedExceptionAction[Unit] { + def run: Unit = func() + }) } def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) { diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index c4bc5054d61a1..80cc0587286b1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -18,12 +18,14 @@ package org.apache.spark.deploy import java.io.{File, PrintStream} -import java.lang.reflect.{InvocationTargetException, Modifier} +import java.lang.reflect.{InvocationTargetException, Modifier, UndeclaredThrowableException} import java.net.URL 
+import java.security.PrivilegedExceptionAction import scala.collection.mutable.{ArrayBuffer, HashMap, Map} import org.apache.hadoop.fs.Path +import org.apache.hadoop.security.UserGroupInformation import org.apache.ivy.Ivy import org.apache.ivy.core.LogOptions import org.apache.ivy.core.module.descriptor._ @@ -79,7 +81,7 @@ object SparkSubmit { private val CLASS_NOT_FOUND_EXIT_STATUS = 101 // Exposed for testing - private[spark] var exitFn: () => Unit = () => System.exit(-1) + private[spark] var exitFn: () => Unit = () => System.exit(1) private[spark] var printStream: PrintStream = System.err private[spark] def printWarning(str: String) = printStream.println("Warning: " + str) private[spark] def printErrorAndExit(str: String) = { @@ -126,6 +128,34 @@ object SparkSubmit { */ private[spark] def submit(args: SparkSubmitArguments): Unit = { val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args) + + def doRunMain(): Unit = { + if (args.proxyUser != null) { + val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser, + UserGroupInformation.getCurrentUser()) + try { + proxyUser.doAs(new PrivilegedExceptionAction[Unit]() { + override def run(): Unit = { + runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose) + } + }) + } catch { + case e: Exception => + // Hadoop's AuthorizationException suppresses the exception's stack trace, which + // makes the message printed to the output by the JVM not very helpful. Instead, + // detect exceptions with empty stack traces here, and treat them differently. + if (e.getStackTrace().length == 0) { + printStream.println(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}") + exitFn() + } else { + throw e + } + } + } else { + runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose) + } + } + // In standalone cluster mode, there are two submission gateways: // (1) The traditional Akka gateway using o.a.s.deploy.Client as a wrapper // (2) The new REST-based gateway introduced in Spark 1.3 @@ -134,7 +164,7 @@ object SparkSubmit { if (args.isStandaloneCluster && args.useRest) { try { printStream.println("Running Spark using the REST application submission protocol.") - runMain(childArgs, childClasspath, sysProps, childMainClass) + doRunMain() } catch { // Fail over to use the legacy submission gateway case e: SubmitRestConnectionException => @@ -145,7 +175,7 @@ object SparkSubmit { } // In all other modes, just run the main class as prepared } else { - runMain(childArgs, childClasspath, sysProps, childMainClass) + doRunMain() } } @@ -457,7 +487,7 @@ object SparkSubmit { childClasspath: Seq[String], sysProps: Map[String, String], childMainClass: String, - verbose: Boolean = false) { + verbose: Boolean): Unit = { if (verbose) { printStream.println(s"Main class:\n$childMainClass") printStream.println(s"Arguments:\n${childArgs.mkString("\n")}") @@ -507,13 +537,21 @@ object SparkSubmit { if (!Modifier.isStatic(mainMethod.getModifiers)) { throw new IllegalStateException("The main method in the given main class must be static") } + + def findCause(t: Throwable): Throwable = t match { + case e: UndeclaredThrowableException => + if (e.getCause() != null) findCause(e.getCause()) else e + case e: InvocationTargetException => + if (e.getCause() != null) findCause(e.getCause()) else e + case e: Throwable => + e + } + try { mainMethod.invoke(null, childArgs.toArray) } catch { - case e: InvocationTargetException => e.getCause match { - case cause: Throwable => throw cause - case null => throw e - } 
+ case t: Throwable => + throw findCause(t) } } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index bd0ae26fd8210..fa38070c6fcfe 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -57,6 +57,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St var pyFiles: String = null var action: SparkSubmitAction = null val sparkProperties: HashMap[String, String] = new HashMap[String, String]() + var proxyUser: String = null // Standalone cluster mode only var supervise: Boolean = false @@ -405,6 +406,10 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St } parse(tail) + case ("--proxy-user") :: value :: tail => + proxyUser = value + parse(tail) + case ("--help" | "-h") :: tail => printUsageAndExit(0) @@ -476,6 +481,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St | | --executor-memory MEM Memory per executor (e.g. 1000M, 2G) (Default: 1G). | + | --proxy-user NAME User to impersonate when submitting the application. + | | --help, -h Show this help message and exit | --verbose, -v Print additional debug output | diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 61d287ca9c3ac..6af8dd555f2aa 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -38,6 +38,7 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder import org.apache.commons.lang3.SystemUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} +import org.apache.hadoop.security.UserGroupInformation import org.apache.log4j.PropertyConfigurator import org.eclipse.jetty.util.MultiException import org.json4s._ @@ -1986,6 +1987,16 @@ private[spark] object Utils extends Logging { throw new SparkException("Invalid master URL: " + sparkUrl, e) } } + + /** + * Returns the current user name. This is the currently logged in user, unless that's been + * overridden by the `SPARK_USER` environment variable. + */ + def getCurrentUserName(): String = { + Option(System.getenv("SPARK_USER")) + .getOrElse(UserGroupInformation.getCurrentUser().getUserName()) + } + } /** From aaf50d05c7616e4f8f16654b642500ae06cdd774 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 10 Feb 2015 17:29:52 -0800 Subject: [PATCH 107/357] [SPARK-5658][SQL] Finalize DDL and write support APIs https://issues.apache.org/jira/browse/SPARK-5658 Author: Yin Huai This patch had conflicts when merged, resolved by Committer: Michael Armbrust Closes #4446 from yhuai/writeSupportFollowup and squashes the following commits: f3a96f7 [Yin Huai] davies's comments. 225ff71 [Yin Huai] Use Scala TestHiveContext to initialize the Python HiveContext in Python tests. 2306f93 [Yin Huai] Style. 2091fcd [Yin Huai] Merge remote-tracking branch 'upstream/master' into writeSupportFollowup 537e28f [Yin Huai] Correctly clean up temp data. ae4649e [Yin Huai] Fix Python test. 609129c [Yin Huai] Doc format. 92b6659 [Yin Huai] Python doc and other minor updates. cbc717f [Yin Huai] Rename dataSourceName to source. d1c12d3 [Yin Huai] No need to delete the duplicate rule since it has been removed in master. 
22cfa70 [Yin Huai] Merge remote-tracking branch 'upstream/master' into writeSupportFollowup d91ecb8 [Yin Huai] Fix test. 4c76d78 [Yin Huai] Simplify APIs. 3abc215 [Yin Huai] Merge remote-tracking branch 'upstream/master' into writeSupportFollowup 0832ce4 [Yin Huai] Fix test. 98e7cdb [Yin Huai] Python style. 2bf44ef [Yin Huai] Python APIs. c204967 [Yin Huai] Format a10223d [Yin Huai] Merge remote-tracking branch 'upstream/master' into writeSupportFollowup 9ff97d8 [Yin Huai] Add SaveMode to saveAsTable. 9b6e570 [Yin Huai] Update doc. c2be775 [Yin Huai] Merge remote-tracking branch 'upstream/master' into writeSupportFollowup 99950a2 [Yin Huai] Use Java enum for SaveMode. 4679665 [Yin Huai] Remove duplicate rule. 77d89dc [Yin Huai] Update doc. e04d908 [Yin Huai] Move import and add (Scala-specific) to scala APIs. cf5703d [Yin Huai] Add checkAnswer to Java tests. 7db95ff [Yin Huai] Merge remote-tracking branch 'upstream/master' into writeSupportFollowup 6dfd386 [Yin Huai] Add java test. f2f33ef [Yin Huai] Fix test. e702386 [Yin Huai] Apache header. b1e9b1b [Yin Huai] Format. ed4e1b4 [Yin Huai] Merge remote-tracking branch 'upstream/master' into writeSupportFollowup af9e9b3 [Yin Huai] DDL and write support API followup. 2a6213a [Yin Huai] Update API names. e6a0b77 [Yin Huai] Update test. 43bae01 [Yin Huai] Remove createTable from HiveContext. 5ffc372 [Yin Huai] Add more load APIs to SQLContext. 5390743 [Yin Huai] Add more save APIs to DataFrame. --- python/pyspark/sql/context.py | 68 ++++++++ python/pyspark/sql/dataframe.py | 72 +++++++- python/pyspark/sql/tests.py | 107 +++++++++++- .../apache/spark/sql/sources/SaveMode.java | 45 +++++ .../org/apache/spark/sql/DataFrame.scala | 160 ++++++++++++++--- .../org/apache/spark/sql/DataFrameImpl.scala | 61 ++----- .../apache/spark/sql/IncomputableColumn.scala | 27 +-- .../scala/org/apache/spark/sql/SQLConf.scala | 2 +- .../org/apache/spark/sql/SQLContext.scala | 164 +++++++++++++++++- .../spark/sql/execution/SparkStrategies.scala | 14 +- .../apache/spark/sql/json/JSONRelation.scala | 30 +++- .../apache/spark/sql/parquet/newParquet.scala | 45 ++++- .../org/apache/spark/sql/sources/ddl.scala | 40 ++++- .../apache/spark/sql/sources/interfaces.scala | 19 ++ .../spark/sql/sources/JavaSaveLoadSuite.java | 97 +++++++++++ .../org/apache/spark/sql/QueryTest.scala | 92 ++++++---- .../sources/CreateTableAsSelectSuite.scala | 29 +++- .../spark/sql/sources/SaveLoadSuite.scala | 59 +++++-- .../apache/spark/sql/hive/HiveContext.scala | 76 -------- .../spark/sql/hive/HiveStrategies.scala | 13 +- .../spark/sql/hive/execution/commands.scala | 105 ++++++++--- .../spark/sql/hive/{ => test}/TestHive.scala | 20 +-- .../hive/JavaMetastoreDataSourcesSuite.java | 147 ++++++++++++++++ .../org/apache/spark/sql/QueryTest.scala | 64 +++++-- .../sql/hive/InsertIntoHiveTableSuite.scala | 33 ++-- .../sql/hive/MetastoreDataSourcesSuite.scala | 118 +++++++++++-- 26 files changed, 1357 insertions(+), 350 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/SaveMode.java create mode 100644 sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java rename sql/hive/src/main/scala/org/apache/spark/sql/hive/{ => test}/TestHive.scala (99%) create mode 100644 sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 49f016a9cf2e9..882c0f98ea40b 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -21,6 +21,7 @@ 
from itertools import imap from py4j.protocol import Py4JError +from py4j.java_collections import MapConverter from pyspark.rdd import _prepare_for_python_RDD from pyspark.serializers import AutoBatchedSerializer, PickleSerializer @@ -87,6 +88,18 @@ def _ssql_ctx(self): self._scala_SQLContext = self._jvm.SQLContext(self._jsc.sc()) return self._scala_SQLContext + def setConf(self, key, value): + """Sets the given Spark SQL configuration property. + """ + self._ssql_ctx.setConf(key, value) + + def getConf(self, key, defaultValue): + """Returns the value of Spark SQL configuration property for the given key. + + If the key is not set, returns defaultValue. + """ + return self._ssql_ctx.getConf(key, defaultValue) + def registerFunction(self, name, f, returnType=StringType()): """Registers a lambda function as a UDF so it can be used in SQL statements. @@ -455,6 +468,61 @@ def func(iterator): df = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) return DataFrame(df, self) + def load(self, path=None, source=None, schema=None, **options): + """Returns the dataset in a data source as a DataFrame. + + The data source is specified by the `source` and a set of `options`. + If `source` is not specified, the default data source configured by + spark.sql.sources.default will be used. + + Optionally, a schema can be provided as the schema of the returned DataFrame. + """ + if path is not None: + options["path"] = path + if source is None: + source = self.getConf("spark.sql.sources.default", + "org.apache.spark.sql.parquet") + joptions = MapConverter().convert(options, + self._sc._gateway._gateway_client) + if schema is None: + df = self._ssql_ctx.load(source, joptions) + else: + if not isinstance(schema, StructType): + raise TypeError("schema should be StructType") + scala_datatype = self._ssql_ctx.parseDataType(schema.json()) + df = self._ssql_ctx.load(source, scala_datatype, joptions) + return DataFrame(df, self) + + def createExternalTable(self, tableName, path=None, source=None, + schema=None, **options): + """Creates an external table based on the dataset in a data source. + + It returns the DataFrame associated with the external table. + + The data source is specified by the `source` and a set of `options`. + If `source` is not specified, the default data source configured by + spark.sql.sources.default will be used. + + Optionally, a schema can be provided as the schema of the returned DataFrame and + created external table. + """ + if path is not None: + options["path"] = path + if source is None: + source = self.getConf("spark.sql.sources.default", + "org.apache.spark.sql.parquet") + joptions = MapConverter().convert(options, + self._sc._gateway._gateway_client) + if schema is None: + df = self._ssql_ctx.createExternalTable(tableName, source, joptions) + else: + if not isinstance(schema, StructType): + raise TypeError("schema should be StructType") + scala_datatype = self._ssql_ctx.parseDataType(schema.json()) + df = self._ssql_ctx.createExternalTable(tableName, source, scala_datatype, + joptions) + return DataFrame(df, self) + def sql(self, sqlQuery): """Return a L{DataFrame} representing the result of the given query. 
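The Python `load` and `createExternalTable` methods added above are thin wrappers over the Scala SQLContext API reworked further down in this patch; as a point of comparison, a brief usage sketch in Scala (assuming an existing `sqlContext` and JSON data at a hypothetical path):

    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    // load with an explicit source name; `sqlContext.load(jsonPath)` alone
    // would fall back to the source configured by spark.sql.sources.default
    val jsonPath = "/tmp/people.json"
    val df = sqlContext.load(jsonPath, "org.apache.spark.sql.json")

    // the same load with a user-supplied schema instead of inferring one
    val schema = StructType(StructField("name", StringType, nullable = true) :: Nil)
    val onlyNames = sqlContext.load("org.apache.spark.sql.json", schema, Map("path" -> jsonPath))

Note that the default-source key consulted here is spark.sql.sources.default, which this patch renames from spark.sql.default.datasource in SQLConf.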
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 04be65fe241c4..3eef0cc376a2d 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -146,9 +146,75 @@ def insertInto(self, tableName, overwrite=False): """ self._jdf.insertInto(tableName, overwrite) - def saveAsTable(self, tableName): - """Creates a new table with the contents of this DataFrame.""" - self._jdf.saveAsTable(tableName) + def _java_save_mode(self, mode): + """Returns the Java save mode based on the Python save mode represented by a string. + """ + jSaveMode = self._sc._jvm.org.apache.spark.sql.sources.SaveMode + jmode = jSaveMode.ErrorIfExists + mode = mode.lower() + if mode == "append": + jmode = jSaveMode.Append + elif mode == "overwrite": + jmode = jSaveMode.Overwrite + elif mode == "ignore": + jmode = jSaveMode.Ignore + elif mode == "error": + pass + else: + raise ValueError( + "Only 'append', 'overwrite', 'ignore', and 'error' are acceptable save mode.") + return jmode + + def saveAsTable(self, tableName, source=None, mode="append", **options): + """Saves the contents of the DataFrame to a data source as a table. + + The data source is specified by the `source` and a set of `options`. + If `source` is not specified, the default data source configured by + spark.sql.sources.default will be used. + + Additionally, mode is used to specify the behavior of the saveAsTable operation when + table already exists in the data source. There are four modes: + + * append: Contents of this DataFrame are expected to be appended to existing table. + * overwrite: Data in the existing table is expected to be overwritten by the contents of \ + this DataFrame. + * error: An exception is expected to be thrown. + * ignore: The save operation is expected to not save the contents of the DataFrame and \ + to not change the existing table. + """ + if source is None: + source = self.sql_ctx.getConf("spark.sql.sources.default", + "org.apache.spark.sql.parquet") + jmode = self._java_save_mode(mode) + joptions = MapConverter().convert(options, + self.sql_ctx._sc._gateway._gateway_client) + self._jdf.saveAsTable(tableName, source, jmode, joptions) + + def save(self, path=None, source=None, mode="append", **options): + """Saves the contents of the DataFrame to a data source. + + The data source is specified by the `source` and a set of `options`. + If `source` is not specified, the default data source configured by + spark.sql.sources.default will be used. + + Additionally, mode is used to specify the behavior of the save operation when + data already exists in the data source. There are four modes: + + * append: Contents of this DataFrame are expected to be appended to existing data. + * overwrite: Existing data is expected to be overwritten by the contents of this DataFrame. + * error: An exception is expected to be thrown. + * ignore: The save operation is expected to not save the contents of the DataFrame and \ + to not change the existing data. 
+ """ + if path is not None: + options["path"] = path + if source is None: + source = self.sql_ctx.getConf("spark.sql.sources.default", + "org.apache.spark.sql.parquet") + jmode = self._java_save_mode(mode) + joptions = MapConverter().convert(options, + self._sc._gateway._gateway_client) + self._jdf.save(source, jmode, joptions) def schema(self): """Returns the schema of this DataFrame (represented by diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index d25c6365ed067..bc945091f7042 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -34,10 +34,9 @@ else: import unittest - -from pyspark.sql import SQLContext, Column +from pyspark.sql import SQLContext, HiveContext, Column from pyspark.sql.types import IntegerType, Row, ArrayType, StructType, StructField, \ - UserDefinedType, DoubleType, LongType + UserDefinedType, DoubleType, LongType, StringType from pyspark.tests import ReusedPySparkTestCase @@ -286,6 +285,37 @@ def test_aggregator(self): self.assertTrue(95 < g.agg(Dsl.approxCountDistinct(df.key)).first()[0]) self.assertEqual(100, g.agg(Dsl.countDistinct(df.value)).first()[0]) + def test_save_and_load(self): + df = self.df + tmpPath = tempfile.mkdtemp() + shutil.rmtree(tmpPath) + df.save(tmpPath, "org.apache.spark.sql.json", "error") + actual = self.sqlCtx.load(tmpPath, "org.apache.spark.sql.json") + self.assertTrue(sorted(df.collect()) == sorted(actual.collect())) + + schema = StructType([StructField("value", StringType(), True)]) + actual = self.sqlCtx.load(tmpPath, "org.apache.spark.sql.json", schema) + self.assertTrue(sorted(df.select("value").collect()) == sorted(actual.collect())) + + df.save(tmpPath, "org.apache.spark.sql.json", "overwrite") + actual = self.sqlCtx.load(tmpPath, "org.apache.spark.sql.json") + self.assertTrue(sorted(df.collect()) == sorted(actual.collect())) + + df.save(source="org.apache.spark.sql.json", mode="overwrite", path=tmpPath, + noUse="this options will not be used in save.") + actual = self.sqlCtx.load(source="org.apache.spark.sql.json", path=tmpPath, + noUse="this options will not be used in load.") + self.assertTrue(sorted(df.collect()) == sorted(actual.collect())) + + defaultDataSourceName = self.sqlCtx.getConf("spark.sql.sources.default", + "org.apache.spark.sql.parquet") + self.sqlCtx.sql("SET spark.sql.sources.default=org.apache.spark.sql.json") + actual = self.sqlCtx.load(path=tmpPath) + self.assertTrue(sorted(df.collect()) == sorted(actual.collect())) + self.sqlCtx.sql("SET spark.sql.sources.default=" + defaultDataSourceName) + + shutil.rmtree(tmpPath) + def test_help_command(self): # Regression test for SPARK-5464 rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}']) @@ -296,5 +326,76 @@ def test_help_command(self): pydoc.render_doc(df.take(1)) +class HiveContextSQLTests(ReusedPySparkTestCase): + + @classmethod + def setUpClass(cls): + ReusedPySparkTestCase.setUpClass() + cls.tempdir = tempfile.NamedTemporaryFile(delete=False) + os.unlink(cls.tempdir.name) + print "type", type(cls.sc) + print "type", type(cls.sc._jsc) + _scala_HiveContext =\ + cls.sc._jvm.org.apache.spark.sql.hive.test.TestHiveContext(cls.sc._jsc.sc()) + cls.sqlCtx = HiveContext(cls.sc, _scala_HiveContext) + cls.testData = [Row(key=i, value=str(i)) for i in range(100)] + rdd = cls.sc.parallelize(cls.testData) + cls.df = cls.sqlCtx.inferSchema(rdd) + + @classmethod + def tearDownClass(cls): + ReusedPySparkTestCase.tearDownClass() + shutil.rmtree(cls.tempdir.name, ignore_errors=True) + + def test_save_and_load_table(self): + 
df = self.df + tmpPath = tempfile.mkdtemp() + shutil.rmtree(tmpPath) + df.saveAsTable("savedJsonTable", "org.apache.spark.sql.json", "append", path=tmpPath) + actual = self.sqlCtx.createExternalTable("externalJsonTable", tmpPath, + "org.apache.spark.sql.json") + self.assertTrue( + sorted(df.collect()) == + sorted(self.sqlCtx.sql("SELECT * FROM savedJsonTable").collect())) + self.assertTrue( + sorted(df.collect()) == + sorted(self.sqlCtx.sql("SELECT * FROM externalJsonTable").collect())) + self.assertTrue(sorted(df.collect()) == sorted(actual.collect())) + self.sqlCtx.sql("DROP TABLE externalJsonTable") + + df.saveAsTable("savedJsonTable", "org.apache.spark.sql.json", "overwrite", path=tmpPath) + schema = StructType([StructField("value", StringType(), True)]) + actual = self.sqlCtx.createExternalTable("externalJsonTable", + source="org.apache.spark.sql.json", + schema=schema, path=tmpPath, + noUse="this options will not be used") + self.assertTrue( + sorted(df.collect()) == + sorted(self.sqlCtx.sql("SELECT * FROM savedJsonTable").collect())) + self.assertTrue( + sorted(df.select("value").collect()) == + sorted(self.sqlCtx.sql("SELECT * FROM externalJsonTable").collect())) + self.assertTrue(sorted(df.select("value").collect()) == sorted(actual.collect())) + self.sqlCtx.sql("DROP TABLE savedJsonTable") + self.sqlCtx.sql("DROP TABLE externalJsonTable") + + defaultDataSourceName = self.sqlCtx.getConf("spark.sql.sources.default", + "org.apache.spark.sql.parquet") + self.sqlCtx.sql("SET spark.sql.sources.default=org.apache.spark.sql.json") + df.saveAsTable("savedJsonTable", path=tmpPath, mode="overwrite") + actual = self.sqlCtx.createExternalTable("externalJsonTable", path=tmpPath) + self.assertTrue( + sorted(df.collect()) == + sorted(self.sqlCtx.sql("SELECT * FROM savedJsonTable").collect())) + self.assertTrue( + sorted(df.collect()) == + sorted(self.sqlCtx.sql("SELECT * FROM externalJsonTable").collect())) + self.assertTrue(sorted(df.collect()) == sorted(actual.collect())) + self.sqlCtx.sql("DROP TABLE savedJsonTable") + self.sqlCtx.sql("DROP TABLE externalJsonTable") + self.sqlCtx.sql("SET spark.sql.sources.default=" + defaultDataSourceName) + + shutil.rmtree(tmpPath) + if __name__ == "__main__": unittest.main() diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/SaveMode.java b/sql/core/src/main/java/org/apache/spark/sql/sources/SaveMode.java new file mode 100644 index 0000000000000..3109f5716da2c --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/SaveMode.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.sources; + +/** + * SaveMode is used to specify the expected behavior of saving a DataFrame to a data source. 
+ */ +public enum SaveMode { + /** + * Append mode means that when saving a DataFrame to a data source, if data/table already exists, + * contents of the DataFrame are expected to be appended to existing data. + */ + Append, + /** + * Overwrite mode means that when saving a DataFrame to a data source, + * if data/table already exists, existing data is expected to be overwritten by the contents of + * the DataFrame. + */ + Overwrite, + /** + * ErrorIfExists mode means that when saving a DataFrame to a data source, if data already exists, + * an exception is expected to be thrown. + */ + ErrorIfExists, + /** + * Ignore mode means that when saving a DataFrame to a data source, if data already exists, + * the save operation is expected to not save the contents of the DataFrame and to not + * change the existing data. + */ + Ignore +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 04e0d09947492..ca8d552c5febf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -17,19 +17,19 @@ package org.apache.spark.sql +import scala.collection.JavaConversions._ import scala.reflect.ClassTag +import scala.util.control.NonFatal import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.api.java.JavaRDD import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.sources.SaveMode import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils -import scala.util.control.NonFatal - - private[sql] object DataFrame { def apply(sqlContext: SQLContext, logicalPlan: LogicalPlan): DataFrame = { new DataFrameImpl(sqlContext, logicalPlan) @@ -574,8 +574,64 @@ trait DataFrame extends RDDApi[Row] { /** * :: Experimental :: - * Creates a table from the the contents of this DataFrame. This will fail if the table already - * exists. + * Creates a table from the the contents of this DataFrame. + * It will use the default data source configured by spark.sql.sources.default. + * This will fail if the table already exists. + * + * Note that this currently only works with DataFrames that are created from a HiveContext as + * there is no notion of a persisted catalog in a standard SQL context. Instead you can write + * an RDD out to a parquet file, and then register that file as a table. This "table" can then + * be the target of an `insertInto`. + */ + @Experimental + def saveAsTable(tableName: String): Unit = { + saveAsTable(tableName, SaveMode.ErrorIfExists) + } + + /** + * :: Experimental :: + * Creates a table from the the contents of this DataFrame, using the default data source + * configured by spark.sql.sources.default and [[SaveMode.ErrorIfExists]] as the save mode. + * + * Note that this currently only works with DataFrames that are created from a HiveContext as + * there is no notion of a persisted catalog in a standard SQL context. Instead you can write + * an RDD out to a parquet file, and then register that file as a table. This "table" can then + * be the target of an `insertInto`. + */ + @Experimental + def saveAsTable(tableName: String, mode: SaveMode): Unit = { + if (sqlContext.catalog.tableExists(Seq(tableName)) && mode == SaveMode.Append) { + // If table already exists and the save mode is Append, + // we will just call insertInto to append the contents of this DataFrame. 
+ insertInto(tableName, overwrite = false) + } else { + val dataSourceName = sqlContext.conf.defaultDataSourceName + saveAsTable(tableName, dataSourceName, mode) + } + } + + /** + * :: Experimental :: + * Creates a table at the given path from the the contents of this DataFrame + * based on a given data source and a set of options, + * using [[SaveMode.ErrorIfExists]] as the save mode. + * + * Note that this currently only works with DataFrames that are created from a HiveContext as + * there is no notion of a persisted catalog in a standard SQL context. Instead you can write + * an RDD out to a parquet file, and then register that file as a table. This "table" can then + * be the target of an `insertInto`. + */ + @Experimental + def saveAsTable( + tableName: String, + source: String): Unit = { + saveAsTable(tableName, source, SaveMode.ErrorIfExists) + } + + /** + * :: Experimental :: + * Creates a table at the given path from the the contents of this DataFrame + * based on a given data source, [[SaveMode]] specified by mode, and a set of options. * * Note that this currently only works with DataFrames that are created from a HiveContext as * there is no notion of a persisted catalog in a standard SQL context. Instead you can write @@ -583,12 +639,17 @@ trait DataFrame extends RDDApi[Row] { * be the target of an `insertInto`. */ @Experimental - def saveAsTable(tableName: String): Unit + def saveAsTable( + tableName: String, + source: String, + mode: SaveMode): Unit = { + saveAsTable(tableName, source, mode, Map.empty[String, String]) + } /** * :: Experimental :: - * Creates a table from the the contents of this DataFrame based on a given data source and - * a set of options. This will fail if the table already exists. + * Creates a table at the given path from the the contents of this DataFrame + * based on a given data source, [[SaveMode]] specified by mode, and a set of options. * * Note that this currently only works with DataFrames that are created from a HiveContext as * there is no notion of a persisted catalog in a standard SQL context. Instead you can write @@ -598,14 +659,17 @@ trait DataFrame extends RDDApi[Row] { @Experimental def saveAsTable( tableName: String, - dataSourceName: String, - option: (String, String), - options: (String, String)*): Unit + source: String, + mode: SaveMode, + options: java.util.Map[String, String]): Unit = { + saveAsTable(tableName, source, mode, options.toMap) + } /** * :: Experimental :: - * Creates a table from the the contents of this DataFrame based on a given data source and - * a set of options. This will fail if the table already exists. + * (Scala-specific) + * Creates a table from the the contents of this DataFrame based on a given data source, + * [[SaveMode]] specified by mode, and a set of options. * * Note that this currently only works with DataFrames that are created from a HiveContext as * there is no notion of a persisted catalog in a standard SQL context. Instead you can write @@ -615,22 +679,76 @@ trait DataFrame extends RDDApi[Row] { @Experimental def saveAsTable( tableName: String, - dataSourceName: String, - options: java.util.Map[String, String]): Unit + source: String, + mode: SaveMode, + options: Map[String, String]): Unit + + /** + * :: Experimental :: + * Saves the contents of this DataFrame to the given path, + * using the default data source configured by spark.sql.sources.default and + * [[SaveMode.ErrorIfExists]] as the save mode. 
+ */ + @Experimental + def save(path: String): Unit = { + save(path, SaveMode.ErrorIfExists) + } + + /** + * :: Experimental :: + * Saves the contents of this DataFrame to the given path and [[SaveMode]] specified by mode, + * using the default data source configured by spark.sql.sources.default. + */ + @Experimental + def save(path: String, mode: SaveMode): Unit = { + val dataSourceName = sqlContext.conf.defaultDataSourceName + save(path, dataSourceName, mode) + } + /** + * :: Experimental :: + * Saves the contents of this DataFrame to the given path based on the given data source, + * using [[SaveMode.ErrorIfExists]] as the save mode. + */ + @Experimental + def save(path: String, source: String): Unit = { + save(source, SaveMode.ErrorIfExists, Map("path" -> path)) + } + + /** + * :: Experimental :: + * Saves the contents of this DataFrame to the given path based on the given data source and + * [[SaveMode]] specified by mode. + */ @Experimental - def save(path: String): Unit + def save(path: String, source: String, mode: SaveMode): Unit = { + save(source, mode, Map("path" -> path)) + } + /** + * :: Experimental :: + * Saves the contents of this DataFrame based on the given data source, + * [[SaveMode]] specified by mode, and a set of options. + */ @Experimental def save( - dataSourceName: String, - option: (String, String), - options: (String, String)*): Unit + source: String, + mode: SaveMode, + options: java.util.Map[String, String]): Unit = { + save(source, mode, options.toMap) + } + /** + * :: Experimental :: + * (Scala-specific) + * Saves the contents of this DataFrame based on the given data source, + * [[SaveMode]] specified by mode, and a set of options + */ @Experimental def save( - dataSourceName: String, - options: java.util.Map[String, String]): Unit + source: String, + mode: SaveMode, + options: Map[String, String]): Unit /** * :: Experimental :: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 1ee16ad5161c8..11f9334556981 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -28,13 +28,14 @@ import org.apache.spark.api.python.SerDeUtil import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.catalyst.{SqlParser, ScalaReflection} -import org.apache.spark.sql.catalyst.analysis.{ResolvedStar, UnresolvedRelation} +import org.apache.spark.sql.catalyst.analysis.{EliminateAnalysisOperators, ResolvedStar, UnresolvedRelation} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.{JoinType, Inner} import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.execution.{LogicalRDD, EvaluatePython} import org.apache.spark.sql.json.JsonRDD -import org.apache.spark.sql.sources.{ResolvedDataSource, CreateTableUsingAsLogicalPlan} +import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{NumericType, StructType} @@ -341,68 +342,34 @@ private[sql] class DataFrameImpl protected[sql]( override def saveAsParquetFile(path: String): Unit = { if (sqlContext.conf.parquetUseDataSourceApi) { - save("org.apache.spark.sql.parquet", "path" -> path) + save("org.apache.spark.sql.parquet", SaveMode.ErrorIfExists, Map("path" -> path)) } else { sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd } } - override def 
saveAsTable(tableName: String): Unit = { - val dataSourceName = sqlContext.conf.defaultDataSourceName - val cmd = - CreateTableUsingAsLogicalPlan( - tableName, - dataSourceName, - temporary = false, - Map.empty, - allowExisting = false, - logicalPlan) - - sqlContext.executePlan(cmd).toRdd - } - override def saveAsTable( tableName: String, - dataSourceName: String, - option: (String, String), - options: (String, String)*): Unit = { + source: String, + mode: SaveMode, + options: Map[String, String]): Unit = { val cmd = CreateTableUsingAsLogicalPlan( tableName, - dataSourceName, + source, temporary = false, - (option +: options).toMap, - allowExisting = false, + mode, + options, logicalPlan) sqlContext.executePlan(cmd).toRdd } - override def saveAsTable( - tableName: String, - dataSourceName: String, - options: java.util.Map[String, String]): Unit = { - val opts = options.toSeq - saveAsTable(tableName, dataSourceName, opts.head, opts.tail:_*) - } - - override def save(path: String): Unit = { - val dataSourceName = sqlContext.conf.defaultDataSourceName - save(dataSourceName, "path" -> path) - } - - override def save( - dataSourceName: String, - option: (String, String), - options: (String, String)*): Unit = { - ResolvedDataSource(sqlContext, dataSourceName, (option +: options).toMap, this) - } - override def save( - dataSourceName: String, - options: java.util.Map[String, String]): Unit = { - val opts = options.toSeq - save(dataSourceName, opts.head, opts.tail:_*) + source: String, + mode: SaveMode, + options: Map[String, String]): Unit = { + ResolvedDataSource(sqlContext, source, mode, options, this) } override def insertInto(tableName: String, overwrite: Boolean): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala index ce0557b88196f..494e49c1317b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala @@ -25,9 +25,9 @@ import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedSt import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.storage.StorageLevel +import org.apache.spark.sql.sources.SaveMode import org.apache.spark.sql.types.StructType - private[sql] class IncomputableColumn(protected[sql] val expr: Expression) extends Column { def this(name: String) = this(name match { @@ -156,29 +156,16 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten override def saveAsParquetFile(path: String): Unit = err() - override def saveAsTable(tableName: String): Unit = err() - - override def saveAsTable( - tableName: String, - dataSourceName: String, - option: (String, String), - options: (String, String)*): Unit = err() - override def saveAsTable( tableName: String, - dataSourceName: String, - options: java.util.Map[String, String]): Unit = err() - - override def save(path: String): Unit = err() - - override def save( - dataSourceName: String, - option: (String, String), - options: (String, String)*): Unit = err() + source: String, + mode: SaveMode, + options: Map[String, String]): Unit = err() override def save( - dataSourceName: String, - options: java.util.Map[String, String]): Unit = err() + source: String, + mode: SaveMode, + options: Map[String, String]): Unit = err() override def insertInto(tableName: String, overwrite: Boolean): Unit = err() diff 
--git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 180f5e765fb91..39f6c2f4bc8b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -50,7 +50,7 @@ private[spark] object SQLConf { val THRIFTSERVER_POOL = "spark.sql.thriftserver.scheduler.pool" // This is used to set the default data source - val DEFAULT_DATA_SOURCE_NAME = "spark.sql.default.datasource" + val DEFAULT_DATA_SOURCE_NAME = "spark.sql.sources.default" // Whether to perform eager analysis on a DataFrame. val DATAFRAME_EAGER_ANALYSIS = "spark.sql.dataframe.eagerAnalysis" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 97e3777f933e4..801505bceb956 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -401,27 +401,173 @@ class SQLContext(@transient val sparkContext: SparkContext) jsonRDD(json.rdd, samplingRatio); } + /** + * :: Experimental :: + * Returns the dataset stored at path as a DataFrame, + * using the default data source configured by spark.sql.sources.default. + */ @Experimental def load(path: String): DataFrame = { val dataSourceName = conf.defaultDataSourceName - load(dataSourceName, ("path", path)) + load(path, dataSourceName) } + /** + * :: Experimental :: + * Returns the dataset stored at path as a DataFrame, + * using the given data source. + */ @Experimental - def load( - dataSourceName: String, - option: (String, String), - options: (String, String)*): DataFrame = { - val resolved = ResolvedDataSource(this, None, dataSourceName, (option +: options).toMap) + def load(path: String, source: String): DataFrame = { + load(source, Map("path" -> path)) + } + + /** + * :: Experimental :: + * Returns the dataset specified by the given data source and a set of options as a DataFrame. + */ + @Experimental + def load(source: String, options: java.util.Map[String, String]): DataFrame = { + load(source, options.toMap) + } + + /** + * :: Experimental :: + * (Scala-specific) + * Returns the dataset specified by the given data source and a set of options as a DataFrame. + */ + @Experimental + def load(source: String, options: Map[String, String]): DataFrame = { + val resolved = ResolvedDataSource(this, None, source, options) DataFrame(this, LogicalRelation(resolved.relation)) } + /** + * :: Experimental :: + * Returns the dataset specified by the given data source and a set of options as a DataFrame, + * using the given schema as the schema of the DataFrame. + */ @Experimental def load( - dataSourceName: String, + source: String, + schema: StructType, options: java.util.Map[String, String]): DataFrame = { - val opts = options.toSeq - load(dataSourceName, opts.head, opts.tail:_*) + load(source, schema, options.toMap) + } + + /** + * :: Experimental :: + * (Scala-specific) + * Returns the dataset specified by the given data source and a set of options as a DataFrame, + * using the given schema as the schema of the DataFrame. 
+ */ + @Experimental + def load( + source: String, + schema: StructType, + options: Map[String, String]): DataFrame = { + val resolved = ResolvedDataSource(this, Some(schema), source, options) + DataFrame(this, LogicalRelation(resolved.relation)) + } + + /** + * :: Experimental :: + * Creates an external table from the given path and returns the corresponding DataFrame. + * It will use the default data source configured by spark.sql.sources.default. + */ + @Experimental + def createExternalTable(tableName: String, path: String): DataFrame = { + val dataSourceName = conf.defaultDataSourceName + createExternalTable(tableName, path, dataSourceName) + } + + /** + * :: Experimental :: + * Creates an external table from the given path based on a data source + * and returns the corresponding DataFrame. + */ + @Experimental + def createExternalTable( + tableName: String, + path: String, + source: String): DataFrame = { + createExternalTable(tableName, source, Map("path" -> path)) + } + + /** + * :: Experimental :: + * Creates an external table from the given path based on a data source and a set of options. + * Then, returns the corresponding DataFrame. + */ + @Experimental + def createExternalTable( + tableName: String, + source: String, + options: java.util.Map[String, String]): DataFrame = { + createExternalTable(tableName, source, options.toMap) + } + + /** + * :: Experimental :: + * (Scala-specific) + * Creates an external table from the given path based on a data source and a set of options. + * Then, returns the corresponding DataFrame. + */ + @Experimental + def createExternalTable( + tableName: String, + source: String, + options: Map[String, String]): DataFrame = { + val cmd = + CreateTableUsing( + tableName, + userSpecifiedSchema = None, + source, + temporary = false, + options, + allowExisting = false, + managedIfNoPath = false) + executePlan(cmd).toRdd + table(tableName) + } + + /** + * :: Experimental :: + * Create an external table from the given path based on a data source, a schema and + * a set of options. Then, returns the corresponding DataFrame. + */ + @Experimental + def createExternalTable( + tableName: String, + source: String, + schema: StructType, + options: java.util.Map[String, String]): DataFrame = { + createExternalTable(tableName, source, schema, options.toMap) + } + + /** + * :: Experimental :: + * (Scala-specific) + * Create an external table from the given path based on a data source, a schema and + * a set of options. Then, returns the corresponding DataFrame. 
+ */ + @Experimental + def createExternalTable( + tableName: String, + source: String, + schema: StructType, + options: Map[String, String]): DataFrame = { + val cmd = + CreateTableUsing( + tableName, + userSpecifiedSchema = Some(schema), + source, + temporary = false, + options, + allowExisting = false, + managedIfNoPath = false) + executePlan(cmd).toRdd + table(tableName) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index edf8a5be64ff1..e915e0e6a0ec1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -309,7 +309,7 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { object DDLStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case CreateTableUsing(tableName, userSpecifiedSchema, provider, true, opts, false) => + case CreateTableUsing(tableName, userSpecifiedSchema, provider, true, opts, false, _) => ExecutedCommand( CreateTempTableUsing( tableName, userSpecifiedSchema, provider, opts)) :: Nil @@ -318,24 +318,20 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case c: CreateTableUsing if c.temporary && c.allowExisting => sys.error("allowExisting should be set to false when creating a temporary table.") - case CreateTableUsingAsSelect(tableName, provider, true, opts, false, query) => + case CreateTableUsingAsSelect(tableName, provider, true, mode, opts, query) => val logicalPlan = sqlContext.parseSql(query) val cmd = - CreateTempTableUsingAsSelect(tableName, provider, opts, logicalPlan) + CreateTempTableUsingAsSelect(tableName, provider, mode, opts, logicalPlan) ExecutedCommand(cmd) :: Nil case c: CreateTableUsingAsSelect if !c.temporary => sys.error("Tables created with SQLContext must be TEMPORARY. Use a HiveContext instead.") - case c: CreateTableUsingAsSelect if c.temporary && c.allowExisting => - sys.error("allowExisting should be set to false when creating a temporary table.") - case CreateTableUsingAsLogicalPlan(tableName, provider, true, opts, false, query) => + case CreateTableUsingAsLogicalPlan(tableName, provider, true, mode, opts, query) => val cmd = - CreateTempTableUsingAsSelect(tableName, provider, opts, query) + CreateTempTableUsingAsSelect(tableName, provider, mode, opts, query) ExecutedCommand(cmd) :: Nil case c: CreateTableUsingAsLogicalPlan if !c.temporary => sys.error("Tables created with SQLContext must be TEMPORARY. 
Use a HiveContext instead.") - case c: CreateTableUsingAsLogicalPlan if c.temporary && c.allowExisting => - sys.error("allowExisting should be set to false when creating a temporary table.") case LogicalDescribeCommand(table, isExtended) => val resultPlan = self.sqlContext.executePlan(table).executedPlan diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala index c4e14c6c92908..f828bcdd65c9e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala @@ -20,8 +20,7 @@ package org.apache.spark.sql.json import java.io.IOException import org.apache.hadoop.fs.Path -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType @@ -29,6 +28,10 @@ import org.apache.spark.sql.types.StructType private[sql] class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { + private def checkPath(parameters: Map[String, String]): String = { + parameters.getOrElse("path", sys.error("'path' must be specified for json data.")) + } + /** Returns a new base relation with the parameters. */ override def createRelation( sqlContext: SQLContext, @@ -52,15 +55,30 @@ private[sql] class DefaultSource override def createRelation( sqlContext: SQLContext, + mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { - val path = parameters.getOrElse("path", sys.error("Option 'path' not specified")) + val path = checkPath(parameters) val filesystemPath = new Path(path) val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - if (fs.exists(filesystemPath)) { - sys.error(s"path $path already exists.") + val doSave = if (fs.exists(filesystemPath)) { + mode match { + case SaveMode.Append => + sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}") + case SaveMode.Overwrite => + fs.delete(filesystemPath, true) + true + case SaveMode.ErrorIfExists => + sys.error(s"path $path already exists.") + case SaveMode.Ignore => false + } + } else { + true + } + if (doSave) { + // Only save data when the save mode is not ignore. 
+ data.toJSON.saveAsTextFile(path) } - data.toJSON.saveAsTextFile(path) createRelation(sqlContext, parameters, data.schema) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 04804f78f5c34..aef9c10fbcd01 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -80,18 +80,45 @@ class DefaultSource override def createRelation( sqlContext: SQLContext, + mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val path = checkPath(parameters) - ParquetRelation.createEmpty( - path, - data.schema.toAttributes, - false, - sqlContext.sparkContext.hadoopConfiguration, - sqlContext) - - val relation = createRelation(sqlContext, parameters, data.schema) - relation.asInstanceOf[ParquetRelation2].insert(data, true) + val filesystemPath = new Path(path) + val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) + val doSave = if (fs.exists(filesystemPath)) { + mode match { + case SaveMode.Append => + sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}") + case SaveMode.Overwrite => + fs.delete(filesystemPath, true) + true + case SaveMode.ErrorIfExists => + sys.error(s"path $path already exists.") + case SaveMode.Ignore => false + } + } else { + true + } + + val relation = if (doSave) { + // Only save data when the save mode is not ignore. + ParquetRelation.createEmpty( + path, + data.schema.toAttributes, + false, + sqlContext.sparkContext.hadoopConfiguration, + sqlContext) + + val createdRelation = createRelation(sqlContext, parameters, data.schema) + createdRelation.asInstanceOf[ParquetRelation2].insert(data, true) + + createdRelation + } else { + // If the save mode is Ignore, we will just create the relation based on existing data. + createRelation(sqlContext, parameters) + } + relation } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index 9f64f761002c9..6487c14b1eb8f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -119,11 +119,20 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { throw new DDLException( "a CREATE TABLE AS SELECT statement does not allow column definitions.") } + // When IF NOT EXISTS clause appears in the query, the save mode will be ignore. 
+ val mode = if (allowExisting.isDefined) { + SaveMode.Ignore + } else if (temp.isDefined) { + SaveMode.Overwrite + } else { + SaveMode.ErrorIfExists + } + CreateTableUsingAsSelect(tableName, provider, temp.isDefined, + mode, options, - allowExisting.isDefined, query.get) } else { val userSpecifiedSchema = columns.flatMap(fields => Some(StructType(fields))) @@ -133,7 +142,8 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { provider, temp.isDefined, options, - allowExisting.isDefined) + allowExisting.isDefined, + managedIfNoPath = false) } } ) @@ -264,6 +274,7 @@ object ResolvedDataSource { def apply( sqlContext: SQLContext, provider: String, + mode: SaveMode, options: Map[String, String], data: DataFrame): ResolvedDataSource = { val loader = Utils.getContextOrSparkClassLoader @@ -277,7 +288,7 @@ object ResolvedDataSource { val relation = clazz.newInstance match { case dataSource: CreatableRelationProvider => - dataSource.createRelation(sqlContext, options, data) + dataSource.createRelation(sqlContext, mode, options, data) case _ => sys.error(s"${clazz.getCanonicalName} does not allow create table as select.") } @@ -307,28 +318,40 @@ private[sql] case class DescribeCommand( new MetadataBuilder().putString("comment", "comment of the column").build())()) } +/** + * Used to represent the operation of create table using a data source. + * @param tableName + * @param userSpecifiedSchema + * @param provider + * @param temporary + * @param options + * @param allowExisting If it is true, we will do nothing when the table already exists. + * If it is false, an exception will be thrown + * @param managedIfNoPath + */ private[sql] case class CreateTableUsing( tableName: String, userSpecifiedSchema: Option[StructType], provider: String, temporary: Boolean, options: Map[String, String], - allowExisting: Boolean) extends Command + allowExisting: Boolean, + managedIfNoPath: Boolean) extends Command private[sql] case class CreateTableUsingAsSelect( tableName: String, provider: String, temporary: Boolean, + mode: SaveMode, options: Map[String, String], - allowExisting: Boolean, query: String) extends Command private[sql] case class CreateTableUsingAsLogicalPlan( tableName: String, provider: String, temporary: Boolean, + mode: SaveMode, options: Map[String, String], - allowExisting: Boolean, query: LogicalPlan) extends Command private [sql] case class CreateTempTableUsing( @@ -348,12 +371,13 @@ private [sql] case class CreateTempTableUsing( private [sql] case class CreateTempTableUsingAsSelect( tableName: String, provider: String, + mode: SaveMode, options: Map[String, String], query: LogicalPlan) extends RunnableCommand { def run(sqlContext: SQLContext) = { val df = DataFrame(sqlContext, query) - val resolved = ResolvedDataSource(sqlContext, provider, options, df) + val resolved = ResolvedDataSource(sqlContext, provider, mode, options, df) sqlContext.registerRDDAsTable( DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName) @@ -364,7 +388,7 @@ private [sql] case class CreateTempTableUsingAsSelect( /** * Builds a map in which keys are case insensitive */ -protected class CaseInsensitiveMap(map: Map[String, String]) extends Map[String, String] +protected[sql] class CaseInsensitiveMap(map: Map[String, String]) extends Map[String, String] with Serializable { val baseMap = map.map(kv => kv.copy(_1 = kv._1.toLowerCase)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 5eecc303ef72b..37fda7ba6e5d0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -79,8 +79,27 @@ trait SchemaRelationProvider { @DeveloperApi trait CreatableRelationProvider { + /** + * Creates a relation with the given parameters based on the contents of the given + * DataFrame. The mode specifies the expected behavior of createRelation when + * data already exists. + * Right now, there are three modes, Append, Overwrite, and ErrorIfExists. + * Append mode means that when saving a DataFrame to a data source, if data already exists, + * contents of the DataFrame are expected to be appended to existing data. + * Overwrite mode means that when saving a DataFrame to a data source, if data already exists, + * existing data is expected to be overwritten by the contents of the DataFrame. + * ErrorIfExists mode means that when saving a DataFrame to a data source, + * if data already exists, an exception is expected to be thrown. + * + * @param sqlContext + * @param mode + * @param parameters + * @param data + * @return + */ def createRelation( sqlContext: SQLContext, + mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation } diff --git a/sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java b/sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java new file mode 100644 index 0000000000000..852baf0e09245 --- /dev/null +++ b/sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.sources; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.test.TestSQLContext$; +import org.apache.spark.sql.*; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.util.Utils; + +public class JavaSaveLoadSuite { + + private transient JavaSparkContext sc; + private transient SQLContext sqlContext; + + String originalDefaultSource; + File path; + DataFrame df; + + private void checkAnswer(DataFrame actual, List expected) { + String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); + if (errorMessage != null) { + Assert.fail(errorMessage); + } + } + + @Before + public void setUp() throws IOException { + sqlContext = TestSQLContext$.MODULE$; + sc = new JavaSparkContext(sqlContext.sparkContext()); + + originalDefaultSource = sqlContext.conf().defaultDataSourceName(); + path = + Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource").getCanonicalFile(); + if (path.exists()) { + path.delete(); + } + + List jsonObjects = new ArrayList(10); + for (int i = 0; i < 10; i++) { + jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}"); + } + JavaRDD rdd = sc.parallelize(jsonObjects); + df = sqlContext.jsonRDD(rdd); + df.registerTempTable("jsonTable"); + } + + @Test + public void saveAndLoad() { + Map options = new HashMap(); + options.put("path", path.toString()); + df.save("org.apache.spark.sql.json", SaveMode.ErrorIfExists, options); + + DataFrame loadedDF = sqlContext.load("org.apache.spark.sql.json", options); + + checkAnswer(loadedDF, df.collectAsList()); + } + + @Test + public void saveAndLoadWithSchema() { + Map options = new HashMap(); + options.put("path", path.toString()); + df.save("org.apache.spark.sql.json", SaveMode.ErrorIfExists, options); + + List fields = new ArrayList<>(); + fields.add(DataTypes.createStructField("b", DataTypes.StringType, true)); + StructType schema = DataTypes.createStructType(fields); + DataFrame loadedDF = sqlContext.load("org.apache.spark.sql.json", schema, options); + + checkAnswer(loadedDF, sqlContext.sql("SELECT b FROM jsonTable").collectAsList()); + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index f9ddd2ca5c567..dfb6858957fb9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql import java.util.{Locale, TimeZone} +import scala.collection.JavaConversions._ + import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.columnar.InMemoryRelation @@ -52,9 +54,51 @@ class QueryTest extends PlanTest { /** * Runs the plan and makes sure the answer matches the expected result. * @param rdd the [[DataFrame]] to be executed - * @param expectedAnswer the expected result, can either be an Any, Seq[Product], or Seq[ Seq[Any] ]. + * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. 
*/ protected def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Unit = { + QueryTest.checkAnswer(rdd, expectedAnswer) match { + case Some(errorMessage) => fail(errorMessage) + case None => + } + } + + protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = { + checkAnswer(rdd, Seq(expectedAnswer)) + } + + def sqlTest(sqlString: String, expectedAnswer: Seq[Row])(implicit sqlContext: SQLContext): Unit = { + test(sqlString) { + checkAnswer(sqlContext.sql(sqlString), expectedAnswer) + } + } + + /** + * Asserts that a given [[DataFrame]] will be executed using the given number of cached results. + */ + def assertCached(query: DataFrame, numCachedTables: Int = 1): Unit = { + val planWithCaching = query.queryExecution.withCachedData + val cachedData = planWithCaching collect { + case cached: InMemoryRelation => cached + } + + assert( + cachedData.size == numCachedTables, + s"Expected query to contain $numCachedTables, but it actually had ${cachedData.size}\n" + + planWithCaching) + } +} + +object QueryTest { + /** + * Runs the plan and makes sure the answer matches the expected result. + * If there was exception during the execution or the contents of the DataFrame does not + * match the expected result, an error message will be returned. Otherwise, a [[None]] will + * be returned. + * @param rdd the [[DataFrame]] to be executed + * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. + */ + def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Option[String] = { val isSorted = rdd.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty def prepareAnswer(answer: Seq[Row]): Seq[Row] = { // Converts data to types that we can do equality comparison using Scala collections. @@ -70,18 +114,20 @@ class QueryTest extends PlanTest { } val sparkAnswer = try rdd.collect().toSeq catch { case e: Exception => - fail( + val errorMessage = s""" |Exception thrown while executing query: |${rdd.queryExecution} |== Exception == |$e |${org.apache.spark.sql.catalyst.util.stackTraceToString(e)} - """.stripMargin) + """.stripMargin + return Some(errorMessage) } if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) { - fail(s""" + val errorMessage = + s""" |Results do not match for query: |${rdd.logicalPlan} |== Analyzed Plan == @@ -90,37 +136,21 @@ class QueryTest extends PlanTest { |${rdd.queryExecution.executedPlan} |== Results == |${sideBySide( - s"== Correct Answer - ${expectedAnswer.size} ==" +: - prepareAnswer(expectedAnswer).map(_.toString), - s"== Spark Answer - ${sparkAnswer.size} ==" +: - prepareAnswer(sparkAnswer).map(_.toString)).mkString("\n")} - """.stripMargin) + s"== Correct Answer - ${expectedAnswer.size} ==" +: + prepareAnswer(expectedAnswer).map(_.toString), + s"== Spark Answer - ${sparkAnswer.size} ==" +: + prepareAnswer(sparkAnswer).map(_.toString)).mkString("\n")} + """.stripMargin + return Some(errorMessage) } - } - protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = { - checkAnswer(rdd, Seq(expectedAnswer)) - } - - def sqlTest(sqlString: String, expectedAnswer: Seq[Row])(implicit sqlContext: SQLContext): Unit = { - test(sqlString) { - checkAnswer(sqlContext.sql(sqlString), expectedAnswer) - } + return None } - /** - * Asserts that a given [[DataFrame]] will be executed using the given number of cached results. 
- */ - def assertCached(query: DataFrame, numCachedTables: Int = 1): Unit = { - val planWithCaching = query.queryExecution.withCachedData - val cachedData = planWithCaching collect { - case cached: InMemoryRelation => cached + def checkAnswer(rdd: DataFrame, expectedAnswer: java.util.List[Row]): String = { + checkAnswer(rdd, expectedAnswer.toSeq) match { + case Some(errorMessage) => errorMessage + case None => null } - - assert( - cachedData.size == numCachedTables, - s"Expected query to contain $numCachedTables, but it actually had ${cachedData.size}\n" + - planWithCaching) } - } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index b02389978b625..29caed9337ff6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -77,12 +77,10 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { sql("SELECT a, b FROM jsonTable"), sql("SELECT a, b FROM jt").collect()) - dropTempTable("jsonTable") - - val message = intercept[RuntimeException]{ + val message = intercept[DDLException]{ sql( s""" - |CREATE TEMPORARY TABLE jsonTable + |CREATE TEMPORARY TABLE IF NOT EXISTS jsonTable |USING org.apache.spark.sql.json.DefaultSource |OPTIONS ( | path '${path.toString}' @@ -91,10 +89,25 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { """.stripMargin) }.getMessage assert( - message.contains(s"path ${path.toString} already exists."), + message.contains(s"a CREATE TEMPORARY TABLE statement does not allow IF NOT EXISTS clause."), "CREATE TEMPORARY TABLE IF NOT EXISTS should not be allowed.") - // Explicitly delete it. + // Overwrite the temporary table. + sql( + s""" + |CREATE TEMPORARY TABLE jsonTable + |USING org.apache.spark.sql.json.DefaultSource + |OPTIONS ( + | path '${path.toString}' + |) AS + |SELECT a * 4 FROM jt + """.stripMargin) + checkAnswer( + sql("SELECT * FROM jsonTable"), + sql("SELECT a * 4 FROM jt").collect()) + + dropTempTable("jsonTable") + // Explicitly delete the data. 
if (path.exists()) Utils.deleteRecursively(path) sql( @@ -104,12 +117,12 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { |OPTIONS ( | path '${path.toString}' |) AS - |SELECT a * 4 FROM jt + |SELECT b FROM jt """.stripMargin) checkAnswer( sql("SELECT * FROM jsonTable"), - sql("SELECT a * 4 FROM jt").collect()) + sql("SELECT b FROM jt").collect()) dropTempTable("jsonTable") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala index fe2f76cc397f5..a51004567175c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala @@ -21,10 +21,10 @@ import java.io.File import org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.DataFrame -import org.apache.spark.util.Utils - import org.apache.spark.sql.catalyst.util +import org.apache.spark.sql.{SQLConf, DataFrame} +import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll { @@ -38,42 +38,60 @@ class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll { override def beforeAll(): Unit = { originalDefaultSource = conf.defaultDataSourceName - conf.setConf("spark.sql.default.datasource", "org.apache.spark.sql.json") path = util.getTempFilePath("datasource").getCanonicalFile val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) df = jsonRDD(rdd) + df.registerTempTable("jsonTable") } override def afterAll(): Unit = { - conf.setConf("spark.sql.default.datasource", originalDefaultSource) + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource) } after { + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource) if (path.exists()) Utils.deleteRecursively(path) } def checkLoad(): Unit = { + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json") checkAnswer(load(path.toString), df.collect()) - checkAnswer(load("org.apache.spark.sql.json", ("path", path.toString)), df.collect()) + + // Test if we can pick up the data source name passed in load. 
+ conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name") + checkAnswer(load(path.toString, "org.apache.spark.sql.json"), df.collect()) + checkAnswer(load("org.apache.spark.sql.json", Map("path" -> path.toString)), df.collect()) + val schema = StructType(StructField("b", StringType, true) :: Nil) + checkAnswer( + load("org.apache.spark.sql.json", schema, Map("path" -> path.toString)), + sql("SELECT b FROM jsonTable").collect()) } - test("save with overwrite and load") { + test("save with path and load") { + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json") df.save(path.toString) - checkLoad + checkLoad() + } + + test("save with path and datasource, and load") { + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name") + df.save(path.toString, "org.apache.spark.sql.json") + checkLoad() } test("save with data source and options, and load") { - df.save("org.apache.spark.sql.json", ("path", path.toString)) - checkLoad + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name") + df.save("org.apache.spark.sql.json", SaveMode.ErrorIfExists, Map("path" -> path.toString)) + checkLoad() } test("save and save again") { - df.save(path.toString) + df.save(path.toString, "org.apache.spark.sql.json") - val message = intercept[RuntimeException] { - df.save(path.toString) + var message = intercept[RuntimeException] { + df.save(path.toString, "org.apache.spark.sql.json") }.getMessage assert( @@ -82,7 +100,18 @@ class SaveLoadSuite extends DataSourceTest with BeforeAndAfterAll { if (path.exists()) Utils.deleteRecursively(path) - df.save(path.toString) - checkLoad + df.save(path.toString, "org.apache.spark.sql.json") + checkLoad() + + df.save("org.apache.spark.sql.json", SaveMode.Overwrite, Map("path" -> path.toString)) + checkLoad() + + message = intercept[RuntimeException] { + df.save("org.apache.spark.sql.json", SaveMode.Append, Map("path" -> path.toString)) + }.getMessage + + assert( + message.contains("Append mode is not supported"), + "We should complain that 'Append mode is not supported' for JSON source.") } } \ No newline at end of file diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 2c00659496972..7ae6ed6f841bf 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -79,18 +79,6 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { } } - /** - * Creates a table using the schema of the given class. - * - * @param tableName The name of the table to create. - * @param allowExisting When false, an exception will be thrown if the table already exists. - * @tparam A A case class that is used to describe the schema of the table to be created. - */ - @Deprecated - def createTable[A <: Product : TypeTag](tableName: String, allowExisting: Boolean = true) { - catalog.createTable("default", tableName, ScalaReflection.attributesFor[A], allowExisting) - } - /** * Invalidate and refresh all the cached the metadata of the given table. 
For performance reasons, * Spark SQL or the external data source library it uses might cache certain metadata about a @@ -107,70 +95,6 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { catalog.invalidateTable("default", tableName) } - @Experimental - def createTable(tableName: String, path: String, allowExisting: Boolean): Unit = { - val dataSourceName = conf.defaultDataSourceName - createTable(tableName, dataSourceName, allowExisting, ("path", path)) - } - - @Experimental - def createTable( - tableName: String, - dataSourceName: String, - allowExisting: Boolean, - option: (String, String), - options: (String, String)*): Unit = { - val cmd = - CreateTableUsing( - tableName, - userSpecifiedSchema = None, - dataSourceName, - temporary = false, - (option +: options).toMap, - allowExisting) - executePlan(cmd).toRdd - } - - @Experimental - def createTable( - tableName: String, - dataSourceName: String, - schema: StructType, - allowExisting: Boolean, - option: (String, String), - options: (String, String)*): Unit = { - val cmd = - CreateTableUsing( - tableName, - userSpecifiedSchema = Some(schema), - dataSourceName, - temporary = false, - (option +: options).toMap, - allowExisting) - executePlan(cmd).toRdd - } - - @Experimental - def createTable( - tableName: String, - dataSourceName: String, - allowExisting: Boolean, - options: java.util.Map[String, String]): Unit = { - val opts = options.toSeq - createTable(tableName, dataSourceName, allowExisting, opts.head, opts.tail:_*) - } - - @Experimental - def createTable( - tableName: String, - dataSourceName: String, - schema: StructType, - allowExisting: Boolean, - options: java.util.Map[String, String]): Unit = { - val opts = options.toSeq - createTable(tableName, dataSourceName, schema, allowExisting, opts.head, opts.tail:_*) - } - /** * Analyzes the given table in the current database to generate statistics, which will be * used in query optimizations. 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 95abc363ae767..cb138be90e2e1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -216,20 +216,21 @@ private[hive] trait HiveStrategies { object HiveDDLStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case CreateTableUsing(tableName, userSpecifiedSchema, provider, false, opts, allowExisting) => + case CreateTableUsing( + tableName, userSpecifiedSchema, provider, false, opts, allowExisting, managedIfNoPath) => ExecutedCommand( CreateMetastoreDataSource( - tableName, userSpecifiedSchema, provider, opts, allowExisting)) :: Nil + tableName, userSpecifiedSchema, provider, opts, allowExisting, managedIfNoPath)) :: Nil - case CreateTableUsingAsSelect(tableName, provider, false, opts, allowExisting, query) => + case CreateTableUsingAsSelect(tableName, provider, false, mode, opts, query) => val logicalPlan = hiveContext.parseSql(query) val cmd = - CreateMetastoreDataSourceAsSelect(tableName, provider, opts, allowExisting, logicalPlan) + CreateMetastoreDataSourceAsSelect(tableName, provider, mode, opts, logicalPlan) ExecutedCommand(cmd) :: Nil - case CreateTableUsingAsLogicalPlan(tableName, provider, false, opts, allowExisting, query) => + case CreateTableUsingAsLogicalPlan(tableName, provider, false, mode, opts, query) => val cmd = - CreateMetastoreDataSourceAsSelect(tableName, provider, opts, allowExisting, query) + CreateMetastoreDataSourceAsSelect(tableName, provider, mode, opts, query) ExecutedCommand(cmd) :: Nil case _ => Nil diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 95dcaccefdc54..f6bea1c6a6fe1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -18,7 +18,9 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.sources.ResolvedDataSource +import org.apache.spark.sql.catalyst.analysis.EliminateAnalysisOperators +import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.sources._ import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -105,7 +107,8 @@ case class CreateMetastoreDataSource( userSpecifiedSchema: Option[StructType], provider: String, options: Map[String, String], - allowExisting: Boolean) extends RunnableCommand { + allowExisting: Boolean, + managedIfNoPath: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] @@ -120,7 +123,7 @@ case class CreateMetastoreDataSource( var isExternal = true val optionsWithPath = - if (!options.contains("path")) { + if (!options.contains("path") && managedIfNoPath) { isExternal = false options + ("path" -> hiveContext.catalog.hiveDefaultTableFilePath(tableName)) } else { @@ -141,22 +144,13 @@ case class CreateMetastoreDataSource( case class CreateMetastoreDataSourceAsSelect( tableName: String, provider: String, + mode: SaveMode, options: Map[String, String], - allowExisting: Boolean, query: LogicalPlan) extends 
RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] - - if (hiveContext.catalog.tableExists(tableName :: Nil)) { - if (allowExisting) { - return Seq.empty[Row] - } else { - sys.error(s"Table $tableName already exists.") - } - } - - val df = DataFrame(hiveContext, query) + var createMetastoreTable = false var isExternal = true val optionsWithPath = if (!options.contains("path")) { @@ -166,15 +160,82 @@ case class CreateMetastoreDataSourceAsSelect( options } - // Create the relation based on the data of df. - ResolvedDataSource(sqlContext, provider, optionsWithPath, df) + if (sqlContext.catalog.tableExists(Seq(tableName))) { + // Check if we need to throw an exception or just return. + mode match { + case SaveMode.ErrorIfExists => + sys.error(s"Table $tableName already exists. " + + s"If you want to append into it, please set mode to SaveMode.Append. " + + s"Or, if you want to overwrite it, please set mode to SaveMode.Overwrite.") + case SaveMode.Ignore => + // Since the table already exists and the save mode is Ignore, we will just return. + return Seq.empty[Row] + case SaveMode.Append => + // Check if the specified data source match the data source of the existing table. + val resolved = + ResolvedDataSource(sqlContext, Some(query.schema), provider, optionsWithPath) + val createdRelation = LogicalRelation(resolved.relation) + EliminateAnalysisOperators(sqlContext.table(tableName).logicalPlan) match { + case l @ LogicalRelation(i: InsertableRelation) => + if (l.schema != createdRelation.schema) { + val errorDescription = + s"Cannot append to table $tableName because the schema of this " + + s"DataFrame does not match the schema of table $tableName." + val errorMessage = + s""" + |$errorDescription + |== Schemas == + |${sideBySide( + s"== Expected Schema ==" +: + l.schema.treeString.split("\\\n"), + s"== Actual Schema ==" +: + createdRelation.schema.treeString.split("\\\n")).mkString("\n")} + """.stripMargin + sys.error(errorMessage) + } else if (i != createdRelation.relation) { + val errorDescription = + s"Cannot append to table $tableName because the resolved relation does not " + + s"match the existing relation of $tableName. " + + s"You can use insertInto($tableName, false) to append this DataFrame to the " + + s"table $tableName and using its data source and options." + val errorMessage = + s""" + |$errorDescription + |== Relations == + |${sideBySide( + s"== Expected Relation ==" :: + l.toString :: Nil, + s"== Actual Relation ==" :: + createdRelation.toString :: Nil).mkString("\n")} + """.stripMargin + sys.error(errorMessage) + } + case o => + sys.error(s"Saving data in ${o.toString} is not supported.") + } + case SaveMode.Overwrite => + hiveContext.sql(s"DROP TABLE IF EXISTS $tableName") + // Need to create the table again. + createMetastoreTable = true + } + } else { + // The table does not exist. We need to create it in metastore. + createMetastoreTable = true + } - hiveContext.catalog.createDataSourceTable( - tableName, - None, - provider, - optionsWithPath, - isExternal) + val df = DataFrame(hiveContext, query) + + // Create the relation based on the data of df. 
+ ResolvedDataSource(sqlContext, provider, mode, optionsWithPath, df) + + if (createMetastoreTable) { + hiveContext.catalog.createDataSourceTable( + tableName, + Some(df.schema), + provider, + optionsWithPath, + isExternal) + } Seq.empty[Row] } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala similarity index 99% rename from sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala rename to sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 7c1d1133c3425..840fbc197259a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -20,9 +20,6 @@ package org.apache.spark.sql.hive.test import java.io.File import java.util.{Set => JavaSet} -import scala.collection.mutable -import scala.language.implicitConversions - import org.apache.hadoop.hive.ql.exec.FunctionRegistry import org.apache.hadoop.hive.ql.io.avro.{AvroContainerInputFormat, AvroContainerOutputFormat} import org.apache.hadoop.hive.ql.metadata.Table @@ -30,16 +27,18 @@ import org.apache.hadoop.hive.ql.processors._ import org.apache.hadoop.hive.serde2.RegexSerDe import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.hive.serde2.avro.AvroSerDe - -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.util.Utils +import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.CacheTableCommand import org.apache.spark.sql.hive._ -import org.apache.spark.sql.SQLConf import org.apache.spark.sql.hive.execution.HiveNativeCommand +import org.apache.spark.util.Utils +import org.apache.spark.{SparkConf, SparkContext} + +import scala.collection.mutable +import scala.language.implicitConversions /* Implicit conversions */ import scala.collection.JavaConversions._ @@ -224,11 +223,10 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { } }), TestTable("src_thrift", () => { - import org.apache.thrift.protocol.TBinaryProtocol - import org.apache.hadoop.hive.serde2.thrift.test.Complex import org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer - import org.apache.hadoop.mapred.SequenceFileInputFormat - import org.apache.hadoop.mapred.SequenceFileOutputFormat + import org.apache.hadoop.hive.serde2.thrift.test.Complex + import org.apache.hadoop.mapred.{SequenceFileInputFormat, SequenceFileOutputFormat} + import org.apache.thrift.protocol.TBinaryProtocol val srcThrift = new Table("default", "src_thrift") srcThrift.setFields(Nil) diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java new file mode 100644 index 0000000000000..9744a2aa3f59c --- /dev/null +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.hive; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.sql.sources.SaveMode; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.QueryTest$; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.hive.test.TestHive$; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.util.Utils; + +public class JavaMetastoreDataSourcesSuite { + private transient JavaSparkContext sc; + private transient HiveContext sqlContext; + + String originalDefaultSource; + File path; + Path hiveManagedPath; + FileSystem fs; + DataFrame df; + + private void checkAnswer(DataFrame actual, List expected) { + String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); + if (errorMessage != null) { + Assert.fail(errorMessage); + } + } + + @Before + public void setUp() throws IOException { + sqlContext = TestHive$.MODULE$; + sc = new JavaSparkContext(sqlContext.sparkContext()); + + originalDefaultSource = sqlContext.conf().defaultDataSourceName(); + path = + Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource").getCanonicalFile(); + if (path.exists()) { + path.delete(); + } + hiveManagedPath = new Path(sqlContext.catalog().hiveDefaultTableFilePath("javaSavedTable")); + fs = hiveManagedPath.getFileSystem(sc.hadoopConfiguration()); + if (fs.exists(hiveManagedPath)){ + fs.delete(hiveManagedPath, true); + } + + List jsonObjects = new ArrayList(10); + for (int i = 0; i < 10; i++) { + jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}"); + } + JavaRDD rdd = sc.parallelize(jsonObjects); + df = sqlContext.jsonRDD(rdd); + df.registerTempTable("jsonTable"); + } + + @After + public void tearDown() throws IOException { + // Clean up tables. 
+ sqlContext.sql("DROP TABLE IF EXISTS javaSavedTable"); + sqlContext.sql("DROP TABLE IF EXISTS externalTable"); + } + + @Test + public void saveExternalTableAndQueryIt() { + Map options = new HashMap(); + options.put("path", path.toString()); + df.saveAsTable("javaSavedTable", "org.apache.spark.sql.json", SaveMode.Append, options); + + checkAnswer( + sqlContext.sql("SELECT * FROM javaSavedTable"), + df.collectAsList()); + + DataFrame loadedDF = + sqlContext.createExternalTable("externalTable", "org.apache.spark.sql.json", options); + + checkAnswer(loadedDF, df.collectAsList()); + checkAnswer( + sqlContext.sql("SELECT * FROM externalTable"), + df.collectAsList()); + } + + @Test + public void saveExternalTableWithSchemaAndQueryIt() { + Map options = new HashMap(); + options.put("path", path.toString()); + df.saveAsTable("javaSavedTable", "org.apache.spark.sql.json", SaveMode.Append, options); + + checkAnswer( + sqlContext.sql("SELECT * FROM javaSavedTable"), + df.collectAsList()); + + List fields = new ArrayList<>(); + fields.add(DataTypes.createStructField("b", DataTypes.StringType, true)); + StructType schema = DataTypes.createStructType(fields); + DataFrame loadedDF = + sqlContext.createExternalTable("externalTable", "org.apache.spark.sql.json", schema, options); + + checkAnswer( + loadedDF, + sqlContext.sql("SELECT b FROM javaSavedTable").collectAsList()); + checkAnswer( + sqlContext.sql("SELECT * FROM externalTable"), + sqlContext.sql("SELECT b FROM javaSavedTable").collectAsList()); + } + + @Test + public void saveTableAndQueryIt() { + Map options = new HashMap(); + df.saveAsTable("javaSavedTable", "org.apache.spark.sql.json", SaveMode.Append, options); + + checkAnswer( + sqlContext.sql("SELECT * FROM javaSavedTable"), + df.collectAsList()); + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala index ba391293884bd..0270e63557963 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -17,10 +17,8 @@ package org.apache.spark.sql -import org.scalatest.FunSuite +import scala.collection.JavaConversions._ -import org.apache.spark.sql.catalyst.expressions.{ExprId, AttributeReference} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.util._ @@ -55,9 +53,36 @@ class QueryTest extends PlanTest { /** * Runs the plan and makes sure the answer matches the expected result. * @param rdd the [[DataFrame]] to be executed - * @param expectedAnswer the expected result, can either be an Any, Seq[Product], or Seq[ Seq[Any] ]. + * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. */ protected def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Unit = { + QueryTest.checkAnswer(rdd, expectedAnswer) match { + case Some(errorMessage) => fail(errorMessage) + case None => + } + } + + protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = { + checkAnswer(rdd, Seq(expectedAnswer)) + } + + def sqlTest(sqlString: String, expectedAnswer: Seq[Row])(implicit sqlContext: SQLContext): Unit = { + test(sqlString) { + checkAnswer(sqlContext.sql(sqlString), expectedAnswer) + } + } +} + +object QueryTest { + /** + * Runs the plan and makes sure the answer matches the expected result. 
+ * If there was exception during the execution or the contents of the DataFrame does not + * match the expected result, an error message will be returned. Otherwise, a [[None]] will + * be returned. + * @param rdd the [[DataFrame]] to be executed + * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. + */ + def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Option[String] = { val isSorted = rdd.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty def prepareAnswer(answer: Seq[Row]): Seq[Row] = { // Converts data to types that we can do equality comparison using Scala collections. @@ -73,18 +98,20 @@ class QueryTest extends PlanTest { } val sparkAnswer = try rdd.collect().toSeq catch { case e: Exception => - fail( + val errorMessage = s""" |Exception thrown while executing query: |${rdd.queryExecution} |== Exception == |$e |${org.apache.spark.sql.catalyst.util.stackTraceToString(e)} - """.stripMargin) + """.stripMargin + return Some(errorMessage) } if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) { - fail(s""" + val errorMessage = + s""" |Results do not match for query: |${rdd.logicalPlan} |== Analyzed Plan == @@ -93,22 +120,21 @@ class QueryTest extends PlanTest { |${rdd.queryExecution.executedPlan} |== Results == |${sideBySide( - s"== Correct Answer - ${expectedAnswer.size} ==" +: - prepareAnswer(expectedAnswer).map(_.toString), - s"== Spark Answer - ${sparkAnswer.size} ==" +: - prepareAnswer(sparkAnswer).map(_.toString)).mkString("\n")} - """.stripMargin) + s"== Correct Answer - ${expectedAnswer.size} ==" +: + prepareAnswer(expectedAnswer).map(_.toString), + s"== Spark Answer - ${sparkAnswer.size} ==" +: + prepareAnswer(sparkAnswer).map(_.toString)).mkString("\n")} + """.stripMargin + return Some(errorMessage) } - } - protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = { - checkAnswer(rdd, Seq(expectedAnswer)) + return None } - def sqlTest(sqlString: String, expectedAnswer: Seq[Row])(implicit sqlContext: SQLContext): Unit = { - test(sqlString) { - checkAnswer(sqlContext.sql(sqlString), expectedAnswer) + def checkAnswer(rdd: DataFrame, expectedAnswer: java.util.List[Row]): String = { + checkAnswer(rdd, expectedAnswer.toSeq) match { + case Some(errorMessage) => errorMessage + case None => null } } - } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 869d01eb398c5..43da7519ac8db 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -19,7 +19,11 @@ package org.apache.spark.sql.hive import java.io.File +import org.scalatest.BeforeAndAfter + import com.google.common.io.Files + +import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.types._ @@ -29,15 +33,22 @@ import org.apache.spark.sql.hive.test.TestHive._ case class TestData(key: Int, value: String) -class InsertIntoHiveTableSuite extends QueryTest { +class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { import org.apache.spark.sql.hive.test.TestHive.implicits._ val testData = TestHive.sparkContext.parallelize( (1 to 100).map(i => TestData(i, i.toString))) - testData.registerTempTable("testData") + + before { + // Since every we are doing tests for DDL statements, + // it is 
better to reset before every test. + TestHive.reset() + // Register the testData, which will be used in every test. + testData.registerTempTable("testData") + } test("insertInto() HiveTable") { - createTable[TestData]("createAndInsertTest") + sql("CREATE TABLE createAndInsertTest (key int, value string)") // Add some data. testData.insertInto("createAndInsertTest") @@ -68,16 +79,18 @@ class InsertIntoHiveTableSuite extends QueryTest { } test("Double create fails when allowExisting = false") { - createTable[TestData]("doubleCreateAndInsertTest") + sql("CREATE TABLE doubleCreateAndInsertTest (key int, value string)") - intercept[org.apache.hadoop.hive.ql.metadata.HiveException] { - createTable[TestData]("doubleCreateAndInsertTest", allowExisting = false) - } + val message = intercept[QueryExecutionException] { + sql("CREATE TABLE doubleCreateAndInsertTest (key int, value string)") + }.getMessage + + println("message!!!!" + message) } test("Double create does not fail when allowExisting = true") { - createTable[TestData]("createAndInsertTest") - createTable[TestData]("createAndInsertTest") + sql("CREATE TABLE doubleCreateAndInsertTest (key int, value string)") + sql("CREATE TABLE IF NOT EXISTS doubleCreateAndInsertTest (key int, value string)") } test("SPARK-4052: scala.collection.Map as value type of MapType") { @@ -98,7 +111,7 @@ class InsertIntoHiveTableSuite extends QueryTest { } test("SPARK-4203:random partition directory order") { - createTable[TestData]("tmp_table") + sql("CREATE TABLE tmp_table (key int, value string)") val tmpDir = Files.createTempDir() sql(s"CREATE TABLE table_with_partition(c1 string) PARTITIONED by (p1 string,p2 string,p3 string,p4 string,p5 string) location '${tmpDir.toURI.toString}' ") sql("INSERT OVERWRITE TABLE table_with_partition partition (p1='a',p2='b',p3='c',p4='c',p5='1') SELECT 'blarr' FROM tmp_table") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 9ce058909f429..f94aabd29ad23 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -19,10 +19,12 @@ package org.apache.spark.sql.hive import java.io.File +import org.apache.spark.sql.sources.SaveMode import org.scalatest.BeforeAndAfterEach import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred.InvalidInputException import org.apache.spark.sql.catalyst.util import org.apache.spark.sql._ @@ -41,11 +43,11 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { override def afterEach(): Unit = { reset() - if (ctasPath.exists()) Utils.deleteRecursively(ctasPath) + if (tempPath.exists()) Utils.deleteRecursively(tempPath) } val filePath = Utils.getSparkClassLoader.getResource("sample.json").getFile - var ctasPath: File = util.getTempFilePath("jsonCTAS").getCanonicalFile + var tempPath: File = util.getTempFilePath("jsonCTAS").getCanonicalFile test ("persistent JSON table") { sql( @@ -270,7 +272,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { |CREATE TABLE ctasJsonTable |USING org.apache.spark.sql.json.DefaultSource |OPTIONS ( - | path '${ctasPath}' + | path '${tempPath}' |) AS |SELECT * FROM jsonTable """.stripMargin) @@ -297,7 +299,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { |CREATE TABLE ctasJsonTable |USING 
org.apache.spark.sql.json.DefaultSource |OPTIONS ( - | path '${ctasPath}' + | path '${tempPath}' |) AS |SELECT * FROM jsonTable """.stripMargin) @@ -309,7 +311,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { |CREATE TABLE ctasJsonTable |USING org.apache.spark.sql.json.DefaultSource |OPTIONS ( - | path '${ctasPath}' + | path '${tempPath}' |) AS |SELECT * FROM jsonTable """.stripMargin) @@ -325,7 +327,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { |CREATE TABLE IF NOT EXISTS ctasJsonTable |USING org.apache.spark.sql.json.DefaultSource |OPTIONS ( - | path '${ctasPath}' + | path '${tempPath}' |) AS |SELECT a FROM jsonTable """.stripMargin) @@ -400,38 +402,122 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { sql("DROP TABLE jsonTable").collect().foreach(println) } - test("save and load table") { + test("save table") { val originalDefaultSource = conf.defaultDataSourceName - conf.setConf("spark.sql.default.datasource", "org.apache.spark.sql.json") val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) val df = jsonRDD(rdd) + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json") + // Save the df as a managed table (by not specifiying the path). df.saveAsTable("savedJsonTable") checkAnswer( sql("SELECT * FROM savedJsonTable"), df.collect()) - createTable("createdJsonTable", catalog.hiveDefaultTableFilePath("savedJsonTable"), false) + // Right now, we cannot append to an existing JSON table. + intercept[RuntimeException] { + df.saveAsTable("savedJsonTable", SaveMode.Append) + } + + // We can overwrite it. + df.saveAsTable("savedJsonTable", SaveMode.Overwrite) + checkAnswer( + sql("SELECT * FROM savedJsonTable"), + df.collect()) + + // When the save mode is Ignore, we will do nothing when the table already exists. + df.select("b").saveAsTable("savedJsonTable", SaveMode.Ignore) + assert(df.schema === table("savedJsonTable").schema) + checkAnswer( + sql("SELECT * FROM savedJsonTable"), + df.collect()) + + // Drop table will also delete the data. + sql("DROP TABLE savedJsonTable") + intercept[InvalidInputException] { + jsonFile(catalog.hiveDefaultTableFilePath("savedJsonTable")) + } + + // Create an external table by specifying the path. + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name") + df.saveAsTable( + "savedJsonTable", + "org.apache.spark.sql.json", + SaveMode.Append, + Map("path" -> tempPath.toString)) + checkAnswer( + sql("SELECT * FROM savedJsonTable"), + df.collect()) + + // Data should not be deleted after we drop the table. 
+ sql("DROP TABLE savedJsonTable") + checkAnswer( + jsonFile(tempPath.toString), + df.collect()) + + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource) + } + + test("create external table") { + val originalDefaultSource = conf.defaultDataSourceName + + val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) + val df = jsonRDD(rdd) + + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name") + df.saveAsTable( + "savedJsonTable", + "org.apache.spark.sql.json", + SaveMode.Append, + Map("path" -> tempPath.toString)) + + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json") + createExternalTable("createdJsonTable", tempPath.toString) assert(table("createdJsonTable").schema === df.schema) checkAnswer( sql("SELECT * FROM createdJsonTable"), df.collect()) - val message = intercept[RuntimeException] { - createTable("createdJsonTable", filePath.toString, false) + var message = intercept[RuntimeException] { + createExternalTable("createdJsonTable", filePath.toString) }.getMessage assert(message.contains("Table createdJsonTable already exists."), "We should complain that ctasJsonTable already exists") - createTable("createdJsonTable", filePath.toString, true) - // createdJsonTable should be not changed. - assert(table("createdJsonTable").schema === df.schema) + // Data should not be deleted. + sql("DROP TABLE createdJsonTable") checkAnswer( - sql("SELECT * FROM createdJsonTable"), + jsonFile(tempPath.toString), df.collect()) - conf.setConf("spark.sql.default.datasource", originalDefaultSource) + // Try to specify the schema. + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "not a source name") + val schema = StructType(StructField("b", StringType, true) :: Nil) + createExternalTable( + "createdJsonTable", + "org.apache.spark.sql.json", + schema, + Map("path" -> tempPath.toString)) + checkAnswer( + sql("SELECT * FROM createdJsonTable"), + sql("SELECT b FROM savedJsonTable").collect()) + + sql("DROP TABLE createdJsonTable") + + message = intercept[RuntimeException] { + createExternalTable( + "createdJsonTable", + "org.apache.spark.sql.json", + schema, + Map.empty[String, String]) + }.getMessage + assert( + message.contains("Option 'path' not specified"), + "We should complain that path is not specified.") + + sql("DROP TABLE savedJsonTable") + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource) } } From 6195e2473b98253ccc9edc3d624ba2bf59ffc398 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 10 Feb 2015 17:32:42 -0800 Subject: [PATCH 108/357] [SQL] Add an exception for analysis errors. Also start from the bottom so we show the first error instead of the top error. Author: Michael Armbrust Closes #4439 from marmbrus/analysisException and squashes the following commits: 45862a0 [Michael Armbrust] fix hive test a773bba [Michael Armbrust] Merge remote-tracking branch 'origin/master' into analysisException f88079f [Michael Armbrust] update more cases fede90a [Michael Armbrust] newline fbf4bc3 [Michael Armbrust] move to sql 6235db4 [Michael Armbrust] [SQL] Add an exception for analysis errors. 
--- .../apache/spark/sql/AnalysisException.scala | 23 +++++++++++++++++++ .../sql/catalyst/analysis/Analyzer.scala | 21 ++++++++++------- .../sql/catalyst/analysis/AnalysisSuite.scala | 14 +++++------ .../org/apache/spark/sql/SQLQuerySuite.scala | 2 +- .../hive/execution/HiveResolutionSuite.scala | 3 ++- 5 files changed, 46 insertions(+), 17 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala new file mode 100644 index 0000000000000..871d560b9d54f --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +/** + * Thrown when a query fails to analyze, usually because the query itself is invalid. + */ +class AnalysisException(message: String) extends Exception(message) with Serializable diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index fb2ff014cef07..3f0d77ad6322a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.util.collection.OpenHashSet +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -80,16 +81,18 @@ class Analyzer(catalog: Catalog, */ object CheckResolution extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = { - plan.transform { + plan.transformUp { case p if p.expressions.exists(!_.resolved) => - throw new TreeNodeException(p, - s"Unresolved attributes: ${p.expressions.filterNot(_.resolved).mkString(",")}") + val missing = p.expressions.filterNot(_.resolved).map(_.prettyString).mkString(",") + val from = p.inputSet.map(_.name).mkString("{", ", ", "}") + + throw new AnalysisException(s"Cannot resolve '$missing' given input columns $from") case p if !p.resolved && p.childrenResolved => - throw new TreeNodeException(p, "Unresolved plan found") + throw new AnalysisException(s"Unresolved operator in the query plan ${p.simpleString}") } match { // As a backstop, use the root node to check that the entire plan tree is resolved. 
case p if !p.resolved => - throw new TreeNodeException(p, "Unresolved plan in tree") + throw new AnalysisException(s"Unresolved operator in the query plan ${p.simpleString}") case p => p } } @@ -314,10 +317,11 @@ class Analyzer(catalog: Catalog, val checkField = (f: StructField) => resolver(f.name, fieldName) val ordinal = fields.indexWhere(checkField) if (ordinal == -1) { - sys.error( + throw new AnalysisException( s"No such struct field $fieldName in ${fields.map(_.name).mkString(", ")}") } else if (fields.indexWhere(checkField, ordinal + 1) != -1) { - sys.error(s"Ambiguous reference to fields ${fields.filter(checkField).mkString(", ")}") + throw new AnalysisException( + s"Ambiguous reference to fields ${fields.filter(checkField).mkString(", ")}") } else { ordinal } @@ -329,7 +333,8 @@ class Analyzer(catalog: Catalog, case ArrayType(StructType(fields), containsNull) => val ordinal = findField(fields) ArrayGetField(expr, fields(ordinal), ordinal, containsNull) - case otherType => sys.error(s"GetField is not valid on fields of type $otherType") + case otherType => + throw new AnalysisException(s"GetField is not valid on fields of type $otherType") } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 60060bf02913b..f011a5ff15ea9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -19,8 +19,8 @@ package org.apache.spark.sql.catalyst.analysis import org.scalatest.{BeforeAndAfter, FunSuite} +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference} -import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ @@ -69,12 +69,12 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) === Project(testRelation.output, testRelation)) - val e = intercept[TreeNodeException[_]] { + val e = intercept[AnalysisException] { caseSensitiveAnalyze( Project(Seq(UnresolvedAttribute("tBl.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) } - assert(e.getMessage().toLowerCase.contains("unresolved")) + assert(e.getMessage().toLowerCase.contains("cannot resolve")) assert( caseInsensitiveAnalyze( @@ -109,10 +109,10 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { } test("throw errors for unresolved attributes during analysis") { - val e = intercept[TreeNodeException[_]] { + val e = intercept[AnalysisException] { caseSensitiveAnalyze(Project(Seq(UnresolvedAttribute("abcd")), testRelation)) } - assert(e.getMessage().toLowerCase.contains("unresolved attribute")) + assert(e.getMessage().toLowerCase.contains("cannot resolve")) } test("throw errors for unresolved plans during analysis") { @@ -120,10 +120,10 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { override lazy val resolved = false override def output = Nil } - val e = intercept[TreeNodeException[_]] { + val e = intercept[AnalysisException] { caseSensitiveAnalyze(UnresolvedTestPlan()) } - assert(e.getMessage().toLowerCase.contains("unresolved plan")) + assert(e.getMessage().toLowerCase.contains("unresolved")) } test("divide should be casted into fractional types") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 11502edf972e9..55fd0b0892fa1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -589,7 +589,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { ("1" :: "2" :: "3" :: "4" :: "A" :: "B" :: "C" :: "D" :: "E" :: "F" :: Nil).map(Row(_))) // Column type mismatches where a coercion is not possible, in this case between integer // and array types, trigger a TreeNodeException. - intercept[TreeNodeException[_]] { + intercept[AnalysisException] { sql("SELECT data FROM arrayData UNION SELECT 1 FROM arrayData").collect() } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala index ff8130ae5f6bc..ab5f9cdddf508 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.hive.execution +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.test.TestHive.{sparkContext, jsonRDD, sql} import org.apache.spark.sql.hive.test.TestHive.implicits._ @@ -40,7 +41,7 @@ class HiveResolutionSuite extends HiveComparisonTest { """{"a": [{"b": 1, "B": 2}]}""" :: Nil)).registerTempTable("nested") // there are 2 filed matching field name "b", we should report Ambiguous reference error - val exception = intercept[RuntimeException] { + val exception = intercept[AnalysisException] { sql("SELECT a[0].b from nested").queryExecution.analyzed } assert(exception.getMessage.contains("Ambiguous reference to fields")) From a60aea86b4d4b716b5ec3bff776b509fe0831342 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Tue, 10 Feb 2015 18:19:56 -0800 Subject: [PATCH 109/357] [SPARK-5683] [SQL] Avoid multiple json generator created Author: Cheng Hao Closes #4468 from chenghao-intel/json and squashes the following commits: aeb7801 [Cheng Hao] avoid multiple json generator created --- .../org/apache/spark/sql/DataFrameImpl.scala | 24 +++++++++++++++++-- .../org/apache/spark/sql/json/JsonRDD.scala | 13 +++------- .../org/apache/spark/sql/json/JsonSuite.scala | 8 +++---- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 11f9334556981..0134b038f3c5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import java.io.CharArrayWriter + import scala.language.implicitConversions import scala.reflect.ClassTag import scala.collection.JavaConversions._ @@ -380,8 +382,26 @@ private[sql] class DataFrameImpl protected[sql]( override def toJSON: RDD[String] = { val rowSchema = this.schema this.mapPartitions { iter => - val jsonFactory = new JsonFactory() - iter.map(JsonRDD.rowToJSON(rowSchema, jsonFactory)) + val writer = new CharArrayWriter() + // create the Generator without separator inserted between 2 records + val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) + + new Iterator[String] { + override def hasNext() = iter.hasNext + override def next(): String = { + JsonRDD.rowToJSON(rowSchema, gen)(iter.next()) + 
gen.flush() + + val json = writer.toString + if (hasNext) { + writer.reset() + } else { + gen.close() + } + + json + } + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index 33ce71b51b213..1043eefcfc6a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -23,8 +23,7 @@ import java.sql.{Date, Timestamp} import scala.collection.Map import scala.collection.convert.Wrappers.{JMapWrapper, JListWrapper} -import com.fasterxml.jackson.core.JsonProcessingException -import com.fasterxml.jackson.core.JsonFactory +import com.fasterxml.jackson.core.{JsonGenerator, JsonProcessingException, JsonFactory} import com.fasterxml.jackson.databind.ObjectMapper import org.apache.spark.rdd.RDD @@ -430,14 +429,11 @@ private[sql] object JsonRDD extends Logging { /** Transforms a single Row to JSON using Jackson * - * @param jsonFactory a JsonFactory object to construct a JsonGenerator * @param rowSchema the schema object used for conversion + * @param gen a JsonGenerator object * @param row The row to convert */ - private[sql] def rowToJSON(rowSchema: StructType, jsonFactory: JsonFactory)(row: Row): String = { - val writer = new StringWriter() - val gen = jsonFactory.createGenerator(writer) - + private[sql] def rowToJSON(rowSchema: StructType, gen: JsonGenerator)(row: Row) = { def valWriter: (DataType, Any) => Unit = { case (_, null) | (NullType, _) => gen.writeNull() case (StringType, v: String) => gen.writeString(v) @@ -479,8 +475,5 @@ private[sql] object JsonRDD extends Logging { } valWriter(rowSchema, row) - gen.close() - writer.toString } - } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 7870cf9b0a868..4fc92e3e3b8c0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -824,8 +824,8 @@ class JsonSuite extends QueryTest { df1.registerTempTable("applySchema1") val df2 = df1.toDataFrame val result = df2.toJSON.collect() - assert(result(0) == "{\"f1\":1,\"f2\":\"A1\",\"f3\":true,\"f4\":[\"1\",\" A1\",\" true\",\" null\"]}") - assert(result(3) == "{\"f1\":4,\"f2\":\"D4\",\"f3\":true,\"f4\":[\"4\",\" D4\",\" true\",\" 2147483644\"],\"f5\":2147483644}") + assert(result(0) === "{\"f1\":1,\"f2\":\"A1\",\"f3\":true,\"f4\":[\"1\",\" A1\",\" true\",\" null\"]}") + assert(result(3) === "{\"f1\":4,\"f2\":\"D4\",\"f3\":true,\"f4\":[\"4\",\" D4\",\" true\",\" 2147483644\"],\"f5\":2147483644}") val schema2 = StructType( StructField("f1", StructType( @@ -846,8 +846,8 @@ class JsonSuite extends QueryTest { val df4 = df3.toDataFrame val result2 = df4.toJSON.collect() - assert(result2(1) == "{\"f1\":{\"f11\":2,\"f12\":false},\"f2\":{\"B2\":null}}") - assert(result2(3) == "{\"f1\":{\"f11\":4,\"f12\":true},\"f2\":{\"D4\":2147483644}}") + assert(result2(1) === "{\"f1\":{\"f11\":2,\"f12\":false},\"f2\":{\"B2\":null}}") + assert(result2(3) === "{\"f1\":{\"f11\":4,\"f12\":true},\"f2\":{\"D4\":2147483644}}") val jsonDF = jsonRDD(primitiveFieldAndType) val primTable = jsonRDD(jsonDF.toJSON) From ea60284095cad43aa7ac98256576375d0e91a52a Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 10 Feb 2015 19:40:12 -0800 Subject: [PATCH 110/357] [SPARK-5704] [SQL] [PySpark] createDataFrame from RDD with columns Deprecate inferSchema() and 
applySchema(), use createDataFrame() instead, which could take an optional `schema` to create an DataFrame from an RDD. The `schema` could be StructType or list of names of columns. Author: Davies Liu Closes #4498 from davies/create and squashes the following commits: 08469c1 [Davies Liu] remove Scala/Java API for now c80a7a9 [Davies Liu] fix hive test d1bd8f2 [Davies Liu] cleanup applySchema 9526e97 [Davies Liu] createDataFrame from RDD with columns --- docs/ml-guide.md | 12 +-- docs/sql-programming-guide.md | 16 ++-- .../ml/JavaCrossValidatorExample.java | 4 +- .../examples/ml/JavaDeveloperApiExample.java | 4 +- .../examples/ml/JavaSimpleParamsExample.java | 4 +- .../JavaSimpleTextClassificationPipeline.java | 4 +- .../spark/examples/sql/JavaSparkSQL.java | 2 +- examples/src/main/python/sql.py | 4 +- .../spark/ml/tuning/CrossValidator.scala | 4 +- .../apache/spark/ml/JavaPipelineSuite.java | 2 +- .../JavaLogisticRegressionSuite.java | 2 +- .../regression/JavaLinearRegressionSuite.java | 2 +- .../ml/tuning/JavaCrossValidatorSuite.java | 2 +- python/pyspark/sql/context.py | 87 +++++++++++++---- python/pyspark/sql/tests.py | 26 ++--- .../org/apache/spark/sql/SQLContext.scala | 95 +++++++++++++++++-- .../spark/sql/ColumnExpressionSuite.scala | 4 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 9 +- .../spark/sql/execution/PlannerSuite.scala | 2 +- .../spark/sql/jdbc/JDBCWriteSuite.scala | 18 ++-- .../org/apache/spark/sql/json/JsonSuite.scala | 4 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 8 +- .../sql/hive/execution/SQLQuerySuite.scala | 4 +- 23 files changed, 222 insertions(+), 97 deletions(-) diff --git a/docs/ml-guide.md b/docs/ml-guide.md index be178d7689fdd..4bf14fba34eec 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -260,7 +260,7 @@ List localTraining = Lists.newArrayList( new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); -JavaSchemaRDD training = jsql.applySchema(jsc.parallelize(localTraining), LabeledPoint.class); +JavaSchemaRDD training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. LogisticRegression lr = new LogisticRegression(); @@ -300,7 +300,7 @@ List localTest = Lists.newArrayList( new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); -JavaSchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), LabeledPoint.class); +JavaSchemaRDD test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); // Make predictions on test documents using the Transformer.transform() method. // LogisticRegression.transform will only use the 'features' column. @@ -443,7 +443,7 @@ List localTraining = Lists.newArrayList( new LabeledDocument(2L, "spark f g h", 1.0), new LabeledDocument(3L, "hadoop mapreduce", 0.0)); JavaSchemaRDD training = - jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class); + jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. 
Tokenizer tokenizer = new Tokenizer() @@ -469,7 +469,7 @@ List localTest = Lists.newArrayList( new Document(6L, "mapreduce spark"), new Document(7L, "apache hadoop")); JavaSchemaRDD test = - jsql.applySchema(jsc.parallelize(localTest), Document.class); + jsql.createDataFrame(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. model.transform(test).registerAsTable("prediction"); @@ -626,7 +626,7 @@ List localTraining = Lists.newArrayList( new LabeledDocument(10L, "spark compile", 1.0), new LabeledDocument(11L, "hadoop software", 0.0)); JavaSchemaRDD training = - jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class); + jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -669,7 +669,7 @@ List localTest = Lists.newArrayList( new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), new Document(7L, "apache hadoop")); -JavaSchemaRDD test = jsql.applySchema(jsc.parallelize(localTest), Document.class); +JavaSchemaRDD test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). cvModel.transform(test).registerAsTable("prediction"); diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 38f617d0c836c..b2b007509c735 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -225,7 +225,7 @@ public static class Person implements Serializable { {% endhighlight %} -A schema can be applied to an existing RDD by calling `applySchema` and providing the Class object +A schema can be applied to an existing RDD by calling `createDataFrame` and providing the Class object for the JavaBean. {% highlight java %} @@ -247,7 +247,7 @@ JavaRDD people = sc.textFile("examples/src/main/resources/people.txt").m }); // Apply a schema to an RDD of JavaBeans and register it as a table. -JavaSchemaRDD schemaPeople = sqlContext.applySchema(people, Person.class); +JavaSchemaRDD schemaPeople = sqlContext.createDataFrame(people, Person.class); schemaPeople.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. @@ -315,7 +315,7 @@ a `SchemaRDD` can be created programmatically with three steps. 1. Create an RDD of `Row`s from the original RDD; 2. Create the schema represented by a `StructType` matching the structure of `Row`s in the RDD created in Step 1. -3. Apply the schema to the RDD of `Row`s via `applySchema` method provided +3. Apply the schema to the RDD of `Row`s via `createDataFrame` method provided by `SQLContext`. For example: @@ -341,7 +341,7 @@ val schema = val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim)) // Apply the schema to the RDD. -val peopleSchemaRDD = sqlContext.applySchema(rowRDD, schema) +val peopleSchemaRDD = sqlContext.createDataFrame(rowRDD, schema) // Register the SchemaRDD as a table. peopleSchemaRDD.registerTempTable("people") @@ -367,7 +367,7 @@ a `SchemaRDD` can be created programmatically with three steps. 1. Create an RDD of `Row`s from the original RDD; 2. Create the schema represented by a `StructType` matching the structure of `Row`s in the RDD created in Step 1. -3. Apply the schema to the RDD of `Row`s via `applySchema` method provided +3. Apply the schema to the RDD of `Row`s via `createDataFrame` method provided by `JavaSQLContext`. 
For example: @@ -406,7 +406,7 @@ JavaRDD rowRDD = people.map( }); // Apply the schema to the RDD. -JavaSchemaRDD peopleSchemaRDD = sqlContext.applySchema(rowRDD, schema); +JavaSchemaRDD peopleSchemaRDD = sqlContext.createDataFrame(rowRDD, schema); // Register the SchemaRDD as a table. peopleSchemaRDD.registerTempTable("people"); @@ -436,7 +436,7 @@ a `SchemaRDD` can be created programmatically with three steps. 1. Create an RDD of tuples or lists from the original RDD; 2. Create the schema represented by a `StructType` matching the structure of tuples or lists in the RDD created in the step 1. -3. Apply the schema to the RDD via `applySchema` method provided by `SQLContext`. +3. Apply the schema to the RDD via `createDataFrame` method provided by `SQLContext`. For example: {% highlight python %} @@ -458,7 +458,7 @@ fields = [StructField(field_name, StringType(), True) for field_name in schemaSt schema = StructType(fields) # Apply the schema to the RDD. -schemaPeople = sqlContext.applySchema(people, schema) +schemaPeople = sqlContext.createDataFrame(people, schema) # Register the SchemaRDD as a table. schemaPeople.registerTempTable("people") diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java index 5041e0b6d34b0..5d8c5d0a92daa 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java @@ -71,7 +71,7 @@ public static void main(String[] args) { new LabeledDocument(9L, "a e c l", 0.0), new LabeledDocument(10L, "spark compile", 1.0), new LabeledDocument(11L, "hadoop software", 0.0)); - DataFrame training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class); + DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -112,7 +112,7 @@ public static void main(String[] args) { new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), new Document(7L, "apache hadoop")); - DataFrame test = jsql.applySchema(jsc.parallelize(localTest), Document.class); + DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). cvModel.transform(test).registerTempTable("prediction"); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index 4d9dad9f23038..19d0eb216848e 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -62,7 +62,7 @@ public static void main(String[] args) throws Exception { new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); - DataFrame training = jsql.applySchema(jsc.parallelize(localTraining), LabeledPoint.class); + DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. 
MyJavaLogisticRegression lr = new MyJavaLogisticRegression(); @@ -80,7 +80,7 @@ public static void main(String[] args) throws Exception { new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); - DataFrame test = jsql.applySchema(jsc.parallelize(localTest), LabeledPoint.class); + DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). DataFrame results = model.transform(test); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java index cc69e6315fdda..4c4d532388781 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java @@ -54,7 +54,7 @@ public static void main(String[] args) { new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); - DataFrame training = jsql.applySchema(jsc.parallelize(localTraining), LabeledPoint.class); + DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. LogisticRegression lr = new LogisticRegression(); @@ -94,7 +94,7 @@ public static void main(String[] args) { new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); - DataFrame test = jsql.applySchema(jsc.parallelize(localTest), LabeledPoint.class); + DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); // Make predictions on test documents using the Transformer.transform() method. // LogisticRegression.transform will only use the 'features' column. diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java index d929f1ad2014a..fdcfc888c235f 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java @@ -54,7 +54,7 @@ public static void main(String[] args) { new LabeledDocument(1L, "b d", 0.0), new LabeledDocument(2L, "spark f g h", 1.0), new LabeledDocument(3L, "hadoop mapreduce", 0.0)); - DataFrame training = jsql.applySchema(jsc.parallelize(localTraining), LabeledDocument.class); + DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -79,7 +79,7 @@ public static void main(String[] args) { new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), new Document(7L, "apache hadoop")); - DataFrame test = jsql.applySchema(jsc.parallelize(localTest), Document.class); + DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. 
model.transform(test).registerTempTable("prediction"); diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java index 8defb769ffaaf..dee794840a3e1 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java @@ -74,7 +74,7 @@ public Person call(String line) { }); // Apply a schema to an RDD of Java Beans and register it as a table. - DataFrame schemaPeople = sqlCtx.applySchema(people, Person.class); + DataFrame schemaPeople = sqlCtx.createDataFrame(people, Person.class); schemaPeople.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py index 7f5c68e3d0fe2..47202fde7510b 100644 --- a/examples/src/main/python/sql.py +++ b/examples/src/main/python/sql.py @@ -31,7 +31,7 @@ Row(name="Smith", age=23), Row(name="Sarah", age=18)]) # Infer schema from the first row, create a DataFrame and print the schema - some_df = sqlContext.inferSchema(some_rdd) + some_df = sqlContext.createDataFrame(some_rdd) some_df.printSchema() # Another RDD is created from a list of tuples @@ -40,7 +40,7 @@ schema = StructType([StructField("person_name", StringType(), False), StructField("person_age", IntegerType(), False)]) # Create a DataFrame by applying the schema to the RDD and print the schema - another_df = sqlContext.applySchema(another_rdd, schema) + another_df = sqlContext.createDataFrame(another_rdd, schema) another_df.printSchema() # root # |-- age: integer (nullable = true) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 5d51c51346665..324b1ba784387 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -76,8 +76,8 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP val metrics = new Array[Double](epm.size) val splits = MLUtils.kFold(dataset.rdd, map(numFolds), 0) splits.zipWithIndex.foreach { case ((training, validation), splitIndex) => - val trainingDataset = sqlCtx.applySchema(training, schema).cache() - val validationDataset = sqlCtx.applySchema(validation, schema).cache() + val trainingDataset = sqlCtx.createDataFrame(training, schema).cache() + val validationDataset = sqlCtx.createDataFrame(validation, schema).cache() // multi-model training logDebug(s"Train split $splitIndex with multiple sets of parameters.") val models = est.fit(trainingDataset, epm).asInstanceOf[Seq[Model[_]]] diff --git a/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java b/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java index 50995ffef9ad5..0a8c9e5954676 100644 --- a/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/JavaPipelineSuite.java @@ -45,7 +45,7 @@ public void setUp() { jsql = new SQLContext(jsc); JavaRDD points = jsc.parallelize(generateLogisticInputAsList(1.0, 1.0, 100, 42), 2); - dataset = jsql.applySchema(points, LabeledPoint.class); + dataset = jsql.createDataFrame(points, LabeledPoint.class); } @After diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java 
b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java index d4b664479255d..3f8e59de0f05c 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java @@ -50,7 +50,7 @@ public void setUp() { jsql = new SQLContext(jsc); List points = generateLogisticInputAsList(1.0, 1.0, 100, 42); datasetRDD = jsc.parallelize(points, 2); - dataset = jsql.applySchema(datasetRDD, LabeledPoint.class); + dataset = jsql.createDataFrame(datasetRDD, LabeledPoint.class); dataset.registerTempTable("dataset"); } diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java index 40d5a92bb32af..0cc36c8d56d70 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java @@ -46,7 +46,7 @@ public void setUp() { jsql = new SQLContext(jsc); List points = generateLogisticInputAsList(1.0, 1.0, 100, 42); datasetRDD = jsc.parallelize(points, 2); - dataset = jsql.applySchema(datasetRDD, LabeledPoint.class); + dataset = jsql.createDataFrame(datasetRDD, LabeledPoint.class); dataset.registerTempTable("dataset"); } diff --git a/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java b/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java index 074b58c07df7a..0bb6b489f2757 100644 --- a/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/tuning/JavaCrossValidatorSuite.java @@ -45,7 +45,7 @@ public void setUp() { jsc = new JavaSparkContext("local", "JavaCrossValidatorSuite"); jsql = new SQLContext(jsc); List points = generateLogisticInputAsList(1.0, 1.0, 100, 42); - dataset = jsql.applySchema(jsc.parallelize(points, 2), LabeledPoint.class); + dataset = jsql.createDataFrame(jsc.parallelize(points, 2), LabeledPoint.class); } @After diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 882c0f98ea40b..9d29ef4839a43 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -25,7 +25,7 @@ from pyspark.rdd import _prepare_for_python_RDD from pyspark.serializers import AutoBatchedSerializer, PickleSerializer -from pyspark.sql.types import StringType, StructType, _verify_type, \ +from pyspark.sql.types import StringType, StructType, _infer_type, _verify_type, \ _infer_schema, _has_nulltype, _merge_type, _create_converter, _python_to_sql_converter from pyspark.sql.dataframe import DataFrame @@ -47,23 +47,11 @@ def __init__(self, sparkContext, sqlContext=None): :param sqlContext: An optional JVM Scala SQLContext. If set, we do not instatiate a new SQLContext in the JVM, instead we make all calls to this object. - >>> df = sqlCtx.inferSchema(rdd) - >>> sqlCtx.inferSchema(df) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - TypeError:... - - >>> bad_rdd = sc.parallelize([1,2,3]) - >>> sqlCtx.inferSchema(bad_rdd) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError:... - >>> from datetime import datetime >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1L, ... b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1), ... 
time=datetime(2014, 8, 1, 14, 1, 5))]) - >>> df = sqlCtx.inferSchema(allTypes) + >>> df = sqlCtx.createDataFrame(allTypes) >>> df.registerTempTable("allTypes") >>> sqlCtx.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a ' ... 'from allTypes where b and i > 0').collect() @@ -131,6 +119,9 @@ def registerFunction(self, name, f, returnType=StringType()): def inferSchema(self, rdd, samplingRatio=None): """Infer and apply a schema to an RDD of L{Row}. + ::note: + Deprecated in 1.3, use :func:`createDataFrame` instead + When samplingRatio is specified, the schema is inferred by looking at the types of each row in the sampled dataset. Otherwise, the first 100 rows of the RDD are inspected. Nested collections are @@ -199,7 +190,7 @@ def inferSchema(self, rdd, samplingRatio=None): warnings.warn("Some of types cannot be determined by the " "first 100 rows, please try again with sampling") else: - if samplingRatio > 0.99: + if samplingRatio < 0.99: rdd = rdd.sample(False, float(samplingRatio)) schema = rdd.map(_infer_schema).reduce(_merge_type) @@ -211,6 +202,9 @@ def applySchema(self, rdd, schema): """ Applies the given schema to the given RDD of L{tuple} or L{list}. + ::note: + Deprecated in 1.3, use :func:`createDataFrame` instead + These tuples or lists can contain complex nested structures like lists, maps or nested rows. @@ -300,13 +294,68 @@ def applySchema(self, rdd, schema): df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) return DataFrame(df, self) + def createDataFrame(self, rdd, schema=None, samplingRatio=None): + """ + Create a DataFrame from an RDD of tuple/list and an optional `schema`. + + `schema` could be :class:`StructType` or a list of column names. + + When `schema` is a list of column names, the type of each column + will be inferred from `rdd`. + + When `schema` is None, it will try to infer the column name and type + from `rdd`, which should be an RDD of :class:`Row`, or namedtuple, + or dict. + + If referring needed, `samplingRatio` is used to determined how many + rows will be used to do referring. The first row will be used if + `samplingRatio` is None. + + :param rdd: an RDD of Row or tuple or list or dict + :param schema: a StructType or list of names of columns + :param samplingRatio: the sample ratio of rows used for inferring + :return: a DataFrame + + >>> rdd = sc.parallelize([('Alice', 1)]) + >>> df = sqlCtx.createDataFrame(rdd, ['name', 'age']) + >>> df.collect() + [Row(name=u'Alice', age=1)] + + >>> from pyspark.sql import Row + >>> Person = Row('name', 'age') + >>> person = rdd.map(lambda r: Person(*r)) + >>> df2 = sqlCtx.createDataFrame(person) + >>> df2.collect() + [Row(name=u'Alice', age=1)] + + >>> from pyspark.sql.types import * + >>> schema = StructType([ + ... StructField("name", StringType(), True), + ... StructField("age", IntegerType(), True)]) + >>> df3 = sqlCtx.createDataFrame(rdd, schema) + >>> df3.collect() + [Row(name=u'Alice', age=1)] + """ + if isinstance(rdd, DataFrame): + raise TypeError("rdd is already a DataFrame") + + if isinstance(schema, StructType): + return self.applySchema(rdd, schema) + else: + if isinstance(schema, (list, tuple)): + first = rdd.first() + if not isinstance(first, (list, tuple)): + raise ValueError("each row in `rdd` should be list or tuple") + row_cls = Row(*schema) + rdd = rdd.map(lambda r: row_cls(*r)) + return self.inferSchema(rdd, samplingRatio) + def registerRDDAsTable(self, rdd, tableName): """Registers the given RDD as a temporary table in the catalog. 
Temporary tables exist only during the lifetime of this instance of SQLContext. - >>> df = sqlCtx.inferSchema(rdd) >>> sqlCtx.registerRDDAsTable(df, "table1") """ if (rdd.__class__ is DataFrame): @@ -321,7 +370,6 @@ def parquetFile(self, *paths): >>> import tempfile, shutil >>> parquetFile = tempfile.mkdtemp() >>> shutil.rmtree(parquetFile) - >>> df = sqlCtx.inferSchema(rdd) >>> df.saveAsParquetFile(parquetFile) >>> df2 = sqlCtx.parquetFile(parquetFile) >>> sorted(df.collect()) == sorted(df2.collect()) @@ -526,7 +574,6 @@ def createExternalTable(self, tableName, path=None, source=None, def sql(self, sqlQuery): """Return a L{DataFrame} representing the result of the given query. - >>> df = sqlCtx.inferSchema(rdd) >>> sqlCtx.registerRDDAsTable(df, "table1") >>> df2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1") >>> df2.collect() @@ -537,7 +584,6 @@ def sql(self, sqlQuery): def table(self, tableName): """Returns the specified table as a L{DataFrame}. - >>> df = sqlCtx.inferSchema(rdd) >>> sqlCtx.registerRDDAsTable(df, "table1") >>> df2 = sqlCtx.table("table1") >>> sorted(df.collect()) == sorted(df2.collect()) @@ -685,11 +731,12 @@ def _test(): sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlCtx'] = sqlCtx = SQLContext(sc) - globs['rdd'] = sc.parallelize( + globs['rdd'] = rdd = sc.parallelize( [Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3")] ) + globs['df'] = sqlCtx.createDataFrame(rdd) jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index bc945091f7042..5e41e36897b5d 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -96,7 +96,7 @@ def setUpClass(cls): cls.sqlCtx = SQLContext(cls.sc) cls.testData = [Row(key=i, value=str(i)) for i in range(100)] rdd = cls.sc.parallelize(cls.testData) - cls.df = cls.sqlCtx.inferSchema(rdd) + cls.df = cls.sqlCtx.createDataFrame(rdd) @classmethod def tearDownClass(cls): @@ -110,14 +110,14 @@ def test_udf(self): def test_udf2(self): self.sqlCtx.registerFunction("strlen", lambda string: len(string), IntegerType()) - self.sqlCtx.inferSchema(self.sc.parallelize([Row(a="test")])).registerTempTable("test") + self.sqlCtx.createDataFrame(self.sc.parallelize([Row(a="test")])).registerTempTable("test") [res] = self.sqlCtx.sql("SELECT strlen(a) FROM test WHERE strlen(a) > 1").collect() self.assertEqual(4, res[0]) def test_udf_with_array_type(self): d = [Row(l=range(3), d={"key": range(5)})] rdd = self.sc.parallelize(d) - self.sqlCtx.inferSchema(rdd).registerTempTable("test") + self.sqlCtx.createDataFrame(rdd).registerTempTable("test") self.sqlCtx.registerFunction("copylist", lambda l: list(l), ArrayType(IntegerType())) self.sqlCtx.registerFunction("maplen", lambda d: len(d), IntegerType()) [(l1, l2)] = self.sqlCtx.sql("select copylist(l), maplen(d) from test").collect() @@ -155,17 +155,17 @@ def test_basic_functions(self): def test_apply_schema_to_row(self): df = self.sqlCtx.jsonRDD(self.sc.parallelize(["""{"a":2}"""])) - df2 = self.sqlCtx.applySchema(df.map(lambda x: x), df.schema()) + df2 = self.sqlCtx.createDataFrame(df.map(lambda x: x), df.schema()) self.assertEqual(df.collect(), df2.collect()) rdd = self.sc.parallelize(range(10)).map(lambda x: Row(a=x)) - df3 = self.sqlCtx.applySchema(rdd, df.schema()) + df3 = self.sqlCtx.createDataFrame(rdd, df.schema()) self.assertEqual(10, df3.count()) def 
test_serialize_nested_array_and_map(self): d = [Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})] rdd = self.sc.parallelize(d) - df = self.sqlCtx.inferSchema(rdd) + df = self.sqlCtx.createDataFrame(rdd) row = df.head() self.assertEqual(1, len(row.l)) self.assertEqual(1, row.l[0].a) @@ -187,14 +187,14 @@ def test_infer_schema(self): d = [Row(l=[], d={}), Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")}, s="")] rdd = self.sc.parallelize(d) - df = self.sqlCtx.inferSchema(rdd) + df = self.sqlCtx.createDataFrame(rdd) self.assertEqual([], df.map(lambda r: r.l).first()) self.assertEqual([None, ""], df.map(lambda r: r.s).collect()) df.registerTempTable("test") result = self.sqlCtx.sql("SELECT l[0].a from test where d['key'].d = '2'") self.assertEqual(1, result.head()[0]) - df2 = self.sqlCtx.inferSchema(rdd, 1.0) + df2 = self.sqlCtx.createDataFrame(rdd, 1.0) self.assertEqual(df.schema(), df2.schema()) self.assertEqual({}, df2.map(lambda r: r.d).first()) self.assertEqual([None, ""], df2.map(lambda r: r.s).collect()) @@ -205,7 +205,7 @@ def test_infer_schema(self): def test_struct_in_map(self): d = [Row(m={Row(i=1): Row(s="")})] rdd = self.sc.parallelize(d) - df = self.sqlCtx.inferSchema(rdd) + df = self.sqlCtx.createDataFrame(rdd) k, v = df.head().m.items()[0] self.assertEqual(1, k.i) self.assertEqual("", v.s) @@ -214,7 +214,7 @@ def test_convert_row_to_dict(self): row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")}) self.assertEqual(1, row.asDict()['l'][0].a) rdd = self.sc.parallelize([row]) - df = self.sqlCtx.inferSchema(rdd) + df = self.sqlCtx.createDataFrame(rdd) df.registerTempTable("test") row = self.sqlCtx.sql("select l, d from test").head() self.assertEqual(1, row.asDict()["l"][0].a) @@ -224,7 +224,7 @@ def test_infer_schema_with_udt(self): from pyspark.sql.tests import ExamplePoint, ExamplePointUDT row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) rdd = self.sc.parallelize([row]) - df = self.sqlCtx.inferSchema(rdd) + df = self.sqlCtx.createDataFrame(rdd) schema = df.schema() field = [f for f in schema.fields if f.name == "point"][0] self.assertEqual(type(field.dataType), ExamplePointUDT) @@ -238,7 +238,7 @@ def test_apply_schema_with_udt(self): rdd = self.sc.parallelize([row]) schema = StructType([StructField("label", DoubleType(), False), StructField("point", ExamplePointUDT(), False)]) - df = self.sqlCtx.applySchema(rdd, schema) + df = self.sqlCtx.createDataFrame(rdd, schema) point = df.head().point self.assertEquals(point, ExamplePoint(1.0, 2.0)) @@ -246,7 +246,7 @@ def test_parquet_with_udt(self): from pyspark.sql.tests import ExamplePoint row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) rdd = self.sc.parallelize([row]) - df0 = self.sqlCtx.inferSchema(rdd) + df0 = self.sqlCtx.createDataFrame(rdd) output_dir = os.path.join(self.tempdir.name, "labeled_point") df0.saveAsParquetFile(output_dir) df1 = self.sqlCtx.parquetFile(output_dir) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 801505bceb956..523911d108029 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -243,7 +243,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * val people = * sc.textFile("examples/src/main/resources/people.txt").map( * _.split(",")).map(p => Row(p(0), p(1).trim.toInt)) - * val dataFrame = sqlContext. 
applySchema(people, schema) + * val dataFrame = sqlContext.createDataFrame(people, schema) * dataFrame.printSchema * // root * // |-- name: string (nullable = false) @@ -252,11 +252,9 @@ class SQLContext(@transient val sparkContext: SparkContext) * dataFrame.registerTempTable("people") * sqlContext.sql("select name from people").collect.foreach(println) * }}} - * - * @group userf */ @DeveloperApi - def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = { + def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = { // TODO: use MutableProjection when rowRDD is another DataFrame and the applied // schema differs from the existing schema on any field data type. val logicalPlan = LogicalRDD(schema.toAttributes, rowRDD)(self) @@ -264,8 +262,21 @@ class SQLContext(@transient val sparkContext: SparkContext) } @DeveloperApi - def applySchema(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = { - applySchema(rowRDD.rdd, schema); + def createDataFrame(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = { + createDataFrame(rowRDD.rdd, schema) + } + + /** + * Creates a [[DataFrame]] from an [[JavaRDD]] containing [[Row]]s by applying + * a seq of names of columns to this RDD, the data type for each column will + * be inferred by the first row. + * + * @param rowRDD an JavaRDD of Row + * @param columns names for each column + * @return DataFrame + */ + def createDataFrame(rowRDD: JavaRDD[Row], columns: java.util.List[String]): DataFrame = { + createDataFrame(rowRDD.rdd, columns.toSeq) } /** @@ -274,7 +285,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, * SELECT * queries will return the columns in an undefined order. */ - def applySchema(rdd: RDD[_], beanClass: Class[_]): DataFrame = { + def createDataFrame(rdd: RDD[_], beanClass: Class[_]): DataFrame = { val attributeSeq = getSchema(beanClass) val className = beanClass.getName val rowRdd = rdd.mapPartitions { iter => @@ -301,8 +312,72 @@ class SQLContext(@transient val sparkContext: SparkContext) * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, * SELECT * queries will return the columns in an undefined order. */ + def createDataFrame(rdd: JavaRDD[_], beanClass: Class[_]): DataFrame = { + createDataFrame(rdd.rdd, beanClass) + } + + /** + * :: DeveloperApi :: + * Creates a [[DataFrame]] from an [[RDD]] containing [[Row]]s by applying a schema to this RDD. + * It is important to make sure that the structure of every [[Row]] of the provided RDD matches + * the provided schema. Otherwise, there will be runtime exception. + * Example: + * {{{ + * import org.apache.spark.sql._ + * val sqlContext = new org.apache.spark.sql.SQLContext(sc) + * + * val schema = + * StructType( + * StructField("name", StringType, false) :: + * StructField("age", IntegerType, true) :: Nil) + * + * val people = + * sc.textFile("examples/src/main/resources/people.txt").map( + * _.split(",")).map(p => Row(p(0), p(1).trim.toInt)) + * val dataFrame = sqlContext. 
applySchema(people, schema) + * dataFrame.printSchema + * // root + * // |-- name: string (nullable = false) + * // |-- age: integer (nullable = true) + * + * dataFrame.registerTempTable("people") + * sqlContext.sql("select name from people").collect.foreach(println) + * }}} + * + * @group userf + */ + @DeveloperApi + @deprecated("use createDataFrame", "1.3.0") + def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = { + createDataFrame(rowRDD, schema) + } + + @DeveloperApi + @deprecated("use createDataFrame", "1.3.0") + def applySchema(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = { + createDataFrame(rowRDD, schema) + } + + /** + * Applies a schema to an RDD of Java Beans. + * + * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, + * SELECT * queries will return the columns in an undefined order. + */ + @deprecated("use createDataFrame", "1.3.0") + def applySchema(rdd: RDD[_], beanClass: Class[_]): DataFrame = { + createDataFrame(rdd, beanClass) + } + + /** + * Applies a schema to an RDD of Java Beans. + * + * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, + * SELECT * queries will return the columns in an undefined order. + */ + @deprecated("use createDataFrame", "1.3.0") def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): DataFrame = { - applySchema(rdd.rdd, beanClass) + createDataFrame(rdd, beanClass) } /** @@ -375,7 +450,7 @@ class SQLContext(@transient val sparkContext: SparkContext) JsonRDD.nullTypeToStringType( JsonRDD.inferSchema(json, 1.0, columnNameOfCorruptJsonRecord))) val rowRDD = JsonRDD.jsonStringToRow(json, appliedSchema, columnNameOfCorruptJsonRecord) - applySchema(rowRDD, appliedSchema) + createDataFrame(rowRDD, appliedSchema) } @Experimental @@ -393,7 +468,7 @@ class SQLContext(@transient val sparkContext: SparkContext) JsonRDD.nullTypeToStringType( JsonRDD.inferSchema(json, samplingRatio, columnNameOfCorruptJsonRecord)) val rowRDD = JsonRDD.jsonStringToRow(json, appliedSchema, columnNameOfCorruptJsonRecord) - applySchema(rowRDD, appliedSchema) + createDataFrame(rowRDD, appliedSchema) } @Experimental diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index fa4cdecbcb340..1d71039872434 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -180,7 +180,7 @@ class ColumnExpressionSuite extends QueryTest { } test("!==") { - val nullData = TestSQLContext.applySchema(TestSQLContext.sparkContext.parallelize( + val nullData = TestSQLContext.createDataFrame(TestSQLContext.sparkContext.parallelize( Row(1, 1) :: Row(1, 2) :: Row(1, null) :: @@ -240,7 +240,7 @@ class ColumnExpressionSuite extends QueryTest { testData2.collect().toSeq.filter(r => r.getInt(0) <= r.getInt(1))) } - val booleanData = TestSQLContext.applySchema(TestSQLContext.sparkContext.parallelize( + val booleanData = TestSQLContext.createDataFrame(TestSQLContext.sparkContext.parallelize( Row(false, false) :: Row(false, true) :: Row(true, false) :: diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 55fd0b0892fa1..bba8899651259 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -34,6 +34,7 @@ class SQLQuerySuite extends QueryTest 
with BeforeAndAfterAll { TestData import org.apache.spark.sql.test.TestSQLContext.implicits._ + val sqlCtx = TestSQLContext test("SPARK-4625 support SORT BY in SimpleSQLParser & DSL") { checkAnswer( @@ -669,7 +670,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { Row(values(0).toInt, values(1), values(2).toBoolean, v4) } - val df1 = applySchema(rowRDD1, schema1) + val df1 = sqlCtx.createDataFrame(rowRDD1, schema1) df1.registerTempTable("applySchema1") checkAnswer( sql("SELECT * FROM applySchema1"), @@ -699,7 +700,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4)) } - val df2 = applySchema(rowRDD2, schema2) + val df2 = sqlCtx.createDataFrame(rowRDD2, schema2) df2.registerTempTable("applySchema2") checkAnswer( sql("SELECT * FROM applySchema2"), @@ -724,7 +725,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { Row(Row(values(0).toInt, values(2).toBoolean), scala.collection.mutable.Map(values(1) -> v4)) } - val df3 = applySchema(rowRDD3, schema2) + val df3 = sqlCtx.createDataFrame(rowRDD3, schema2) df3.registerTempTable("applySchema3") checkAnswer( @@ -769,7 +770,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { .build() val schemaWithMeta = new StructType(Array( schema("id"), schema("name").copy(metadata = metadata), schema("age"))) - val personWithMeta = applySchema(person.rdd, schemaWithMeta) + val personWithMeta = sqlCtx.createDataFrame(person.rdd, schemaWithMeta) def validateMetadata(rdd: DataFrame): Unit = { assert(rdd.schema("name").metadata.getString(docKey) == docValue) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index df108a9d262bb..c3210733f1d42 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -71,7 +71,7 @@ class PlannerSuite extends FunSuite { val schema = StructType(fields) val row = Row.fromSeq(Seq.fill(fields.size)(null)) val rowRDD = org.apache.spark.sql.test.TestSQLContext.sparkContext.parallelize(row :: Nil) - applySchema(rowRDD, schema).registerTempTable("testLimit") + createDataFrame(rowRDD, schema).registerTempTable("testLimit") val planned = sql( """ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index e581ac9b50c2b..21e70936102fd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -54,7 +54,7 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { StructField("seq", IntegerType) :: Nil) test("Basic CREATE") { - val srdd = TestSQLContext.applySchema(sc.parallelize(arr2x2), schema2) + val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) srdd.createJDBCTable(url, "TEST.BASICCREATETEST", false) assert(2 == TestSQLContext.jdbcRDD(url, "TEST.BASICCREATETEST").count) @@ -62,8 +62,8 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { } test("CREATE with overwrite") { - val srdd = TestSQLContext.applySchema(sc.parallelize(arr2x3), schema3) - val srdd2 = TestSQLContext.applySchema(sc.parallelize(arr1x2), schema2) + val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x3), schema3) + val srdd2 = 
TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2) srdd.createJDBCTable(url, "TEST.DROPTEST", false) assert(2 == TestSQLContext.jdbcRDD(url, "TEST.DROPTEST").count) @@ -75,8 +75,8 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { } test("CREATE then INSERT to append") { - val srdd = TestSQLContext.applySchema(sc.parallelize(arr2x2), schema2) - val srdd2 = TestSQLContext.applySchema(sc.parallelize(arr1x2), schema2) + val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) + val srdd2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2) srdd.createJDBCTable(url, "TEST.APPENDTEST", false) srdd2.insertIntoJDBC(url, "TEST.APPENDTEST", false) @@ -85,8 +85,8 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { } test("CREATE then INSERT to truncate") { - val srdd = TestSQLContext.applySchema(sc.parallelize(arr2x2), schema2) - val srdd2 = TestSQLContext.applySchema(sc.parallelize(arr1x2), schema2) + val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) + val srdd2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2) srdd.createJDBCTable(url, "TEST.TRUNCATETEST", false) srdd2.insertIntoJDBC(url, "TEST.TRUNCATETEST", true) @@ -95,8 +95,8 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { } test("Incompatible INSERT to append") { - val srdd = TestSQLContext.applySchema(sc.parallelize(arr2x2), schema2) - val srdd2 = TestSQLContext.applySchema(sc.parallelize(arr2x3), schema3) + val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) + val srdd2 = TestSQLContext.createDataFrame(sc.parallelize(arr2x3), schema3) srdd.createJDBCTable(url, "TEST.INCOMPATIBLETEST", false) intercept[org.apache.spark.SparkException] { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 4fc92e3e3b8c0..fde4b47438c56 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -820,7 +820,7 @@ class JsonSuite extends QueryTest { Row(values(0).toInt, values(1), values(2).toBoolean, r.split(",").toList, v5) } - val df1 = applySchema(rowRDD1, schema1) + val df1 = createDataFrame(rowRDD1, schema1) df1.registerTempTable("applySchema1") val df2 = df1.toDataFrame val result = df2.toJSON.collect() @@ -841,7 +841,7 @@ class JsonSuite extends QueryTest { Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4)) } - val df3 = applySchema(rowRDD2, schema2) + val df3 = createDataFrame(rowRDD2, schema2) df3.registerTempTable("applySchema2") val df4 = df3.toDataFrame val result2 = df4.toJSON.collect() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 43da7519ac8db..89b18c3439cf6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -97,7 +97,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { val schema = StructType(StructField("m", MapType(StringType, StringType), true) :: Nil) val rowRDD = TestHive.sparkContext.parallelize( (1 to 100).map(i => Row(scala.collection.mutable.HashMap(s"key$i" -> s"value$i")))) - val df = applySchema(rowRDD, schema) + val df = TestHive.createDataFrame(rowRDD, schema) 
df.registerTempTable("tableWithMapValue") sql("CREATE TABLE hiveTableWithMapValue(m MAP )") sql("INSERT OVERWRITE TABLE hiveTableWithMapValue SELECT m FROM tableWithMapValue") @@ -142,7 +142,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { val schema = StructType(Seq( StructField("a", ArrayType(StringType, containsNull = false)))) val rowRDD = TestHive.sparkContext.parallelize((1 to 100).map(i => Row(Seq(s"value$i")))) - val df = applySchema(rowRDD, schema) + val df = TestHive.createDataFrame(rowRDD, schema) df.registerTempTable("tableWithArrayValue") sql("CREATE TABLE hiveTableWithArrayValue(a Array )") sql("INSERT OVERWRITE TABLE hiveTableWithArrayValue SELECT a FROM tableWithArrayValue") @@ -159,7 +159,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { StructField("m", MapType(StringType, StringType, valueContainsNull = false)))) val rowRDD = TestHive.sparkContext.parallelize( (1 to 100).map(i => Row(Map(s"key$i" -> s"value$i")))) - val df = applySchema(rowRDD, schema) + val df = TestHive.createDataFrame(rowRDD, schema) df.registerTempTable("tableWithMapValue") sql("CREATE TABLE hiveTableWithMapValue(m Map )") sql("INSERT OVERWRITE TABLE hiveTableWithMapValue SELECT m FROM tableWithMapValue") @@ -176,7 +176,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { StructField("s", StructType(Seq(StructField("f", StringType, nullable = false)))))) val rowRDD = TestHive.sparkContext.parallelize( (1 to 100).map(i => Row(Row(s"value$i")))) - val df = applySchema(rowRDD, schema) + val df = TestHive.createDataFrame(rowRDD, schema) df.registerTempTable("tableWithStructValue") sql("CREATE TABLE hiveTableWithStructValue(s Struct )") sql("INSERT OVERWRITE TABLE hiveTableWithStructValue SELECT s FROM tableWithStructValue") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 49fe79d989259..9a6e8650a0ec4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.hive.HiveShim +import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{QueryTest, Row, SQLConf} @@ -34,6 +35,7 @@ case class Nested3(f3: Int) class SQLQuerySuite extends QueryTest { import org.apache.spark.sql.hive.test.TestHive.implicits._ + val sqlCtx = TestHive test("SPARK-4512 Fix attribute reference resolution error when using SORT BY") { checkAnswer( @@ -277,7 +279,7 @@ class SQLQuerySuite extends QueryTest { val rowRdd = sparkContext.parallelize(row :: Nil) - applySchema(rowRdd, schema).registerTempTable("testTable") + sqlCtx.createDataFrame(rowRdd, schema).registerTempTable("testTable") sql( """CREATE TABLE nullValuesInInnerComplexTypes From 45df77b8418873a00d770e435358bf603765595f Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Tue, 10 Feb 2015 19:40:51 -0800 Subject: [PATCH 111/357] [SPARK-5709] [SQL] Add EXPLAIN support in DataFrame API for debugging purpose Author: Cheng Hao Closes #4496 from chenghao-intel/df_explain and squashes the following commits: 552aa58 [Cheng Hao] Add explain support for DF --- .../main/scala/org/apache/spark/sql/Column.scala | 8 ++++++++ .../main/scala/org/apache/spark/sql/DataFrame.scala | 6 ++++++ 
.../scala/org/apache/spark/sql/DataFrameImpl.scala | 13 ++++++++++--- .../org/apache/spark/sql/execution/commands.scala | 7 +++++-- .../scala/org/apache/spark/sql/hive/HiveQl.scala | 8 +++----- 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 1011bf0bb5ef4..b0e95908ee71a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -600,6 +600,14 @@ trait Column extends DataFrame { def desc: Column = exprToColumn(SortOrder(expr, Descending), computable = false) def asc: Column = exprToColumn(SortOrder(expr, Ascending), computable = false) + + override def explain(extended: Boolean): Unit = { + if (extended) { + println(expr) + } else { + println(expr.prettyString) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index ca8d552c5febf..17900c5ee3892 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -124,6 +124,12 @@ trait DataFrame extends RDDApi[Row] { /** Prints the schema to the console in a nice tree format. */ def printSchema(): Unit + /** Prints the plans (logical and physical) to the console for debugging purpose. */ + def explain(extended: Boolean): Unit + + /** Only prints the physical plan to the console for debugging purpose. */ + def explain(): Unit = explain(false) + /** * Returns true if the `collect` and `take` methods can be run locally * (without any Spark executors). diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 0134b038f3c5a..9638ce0865db0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -30,12 +30,11 @@ import org.apache.spark.api.python.SerDeUtil import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.catalyst.{SqlParser, ScalaReflection} -import org.apache.spark.sql.catalyst.analysis.{EliminateAnalysisOperators, ResolvedStar, UnresolvedRelation} +import org.apache.spark.sql.catalyst.analysis.{ResolvedStar, UnresolvedRelation} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.{JoinType, Inner} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.util.sideBySide -import org.apache.spark.sql.execution.{LogicalRDD, EvaluatePython} +import org.apache.spark.sql.execution.{ExplainCommand, LogicalRDD, EvaluatePython} import org.apache.spark.sql.json.JsonRDD import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{NumericType, StructType} @@ -115,6 +114,14 @@ private[sql] class DataFrameImpl protected[sql]( override def printSchema(): Unit = println(schema.treeString) + override def explain(extended: Boolean): Unit = { + ExplainCommand( + logicalPlan, + extended = extended).queryExecution.executedPlan.executeCollect().map { + r => println(r.getString(0)) + } + } + override def isLocal: Boolean = { logicalPlan.isInstanceOf[LocalRelation] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 335757087deef..2b1726ad4e89f 100644 
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -20,9 +20,10 @@ package org.apache.spark.sql.execution import org.apache.spark.Logging import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext} import org.apache.spark.sql.catalyst.errors.TreeNodeException -import org.apache.spark.sql.catalyst.expressions.{Row, Attribute} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Row, Attribute} import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import scala.collection.mutable.ArrayBuffer @@ -116,7 +117,9 @@ case class SetCommand( @DeveloperApi case class ExplainCommand( logicalPlan: LogicalPlan, - override val output: Seq[Attribute], extended: Boolean = false) extends RunnableCommand { + override val output: Seq[Attribute] = + Seq(AttributeReference("plan", StringType, nullable = false)()), + extended: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sqlContext: SQLContext) = try { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 8618301ba84d6..f3c9e63652a8e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -466,23 +466,21 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C // Just fake explain for any of the native commands. case Token("TOK_EXPLAIN", explainArgs) if noExplainCommands.contains(explainArgs.head.getText) => - ExplainCommand(NoRelation, Seq(AttributeReference("plan", StringType, nullable = false)())) + ExplainCommand(NoRelation) case Token("TOK_EXPLAIN", explainArgs) if "TOK_CREATETABLE" == explainArgs.head.getText => val Some(crtTbl) :: _ :: extended :: Nil = getClauses(Seq("TOK_CREATETABLE", "FORMATTED", "EXTENDED"), explainArgs) ExplainCommand( nodeToPlan(crtTbl), - Seq(AttributeReference("plan", StringType,nullable = false)()), - extended != None) + extended = extended.isDefined) case Token("TOK_EXPLAIN", explainArgs) => // Ignore FORMATTED if present. val Some(query) :: _ :: extended :: Nil = getClauses(Seq("TOK_QUERY", "FORMATTED", "EXTENDED"), explainArgs) ExplainCommand( nodeToPlan(query), - Seq(AttributeReference("plan", StringType, nullable = false)()), - extended != None) + extended = extended.isDefined) case Token("TOK_DESCTABLE", describeArgs) => // Reference: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL From 7e24249af1e2f896328ef0402fa47db78cb6f9ec Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 10 Feb 2015 19:50:44 -0800 Subject: [PATCH 112/357] [SQL][DataFrame] Fix column computability bug. Do not recursively strip out projects. Only strip the first level project. ```scala df("colA") + df("colB").as("colC") ``` Previously, the above would construct an invalid plan. Author: Reynold Xin Closes #4519 from rxin/computability and squashes the following commits: 87ff763 [Reynold Xin] Code review feedback. 015c4fc [Reynold Xin] [SQL][DataFrame] Fix column computability. 
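A hedged sketch of the behaviour after this fix, mirroring the new `ColumnExpressionSuite` test (the `toDataFrame` helper, the expression and the `Row(3)` result come from that test; everything else is illustrative):

```scala
// Sketch only — assumes the TestSQLContext implicits the suite already imports.
import org.apache.spark.sql.test.TestSQLContext.implicits._

val df = Seq((1, 2, 3)).toDataFrame("a", "b", "c")

// Both operands share df's logical plan, so only the top-level Project is stripped
// and the combined column stays computable: collecting it yields Row(3).
val combined = df("a") + df("b").as("c")
```

Columns that mix different projections, such as `df.select(($"a" + 1).as("d"))("d") + df("b")`, now fall back to ordinary incomputable expressions (still usable, e.g. in join conditions) instead of building an invalid plan.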
--- .../MatrixFactorizationModel.scala | 2 +- .../scala/org/apache/spark/sql/Column.scala | 35 ++++++++++++++----- .../org/apache/spark/sql/SQLContext.scala | 4 +-- .../spark/sql/ColumnExpressionSuite.scala | 13 +++++-- 4 files changed, 39 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala index 9ff06ac362a31..16979c9ed43ca 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala @@ -180,7 +180,7 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] { def save(model: MatrixFactorizationModel, path: String): Unit = { val sc = model.userFeatures.sparkContext val sqlContext = new SQLContext(sc) - import sqlContext.implicits.createDataFrame + import sqlContext.implicits._ val metadata = (thisClassName, thisFormatVersion, model.rank) val metadataRDD = sc.parallelize(Seq(metadata), 1).toDataFrame("class", "version", "rank") metadataRDD.toJSON.saveAsTextFile(metadataPath(path)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index b0e95908ee71a..9d5d6e78bd487 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -66,27 +66,44 @@ trait Column extends DataFrame { */ def isComputable: Boolean + /** Removes the top project so we can get to the underlying plan. */ + private def stripProject(p: LogicalPlan): LogicalPlan = p match { + case Project(_, child) => child + case p => sys.error("Unexpected logical plan (expected Project): " + p) + } + private def computableCol(baseCol: ComputableColumn, expr: Expression) = { - val plan = Project(Seq(expr match { + val namedExpr = expr match { case named: NamedExpression => named case unnamed: Expression => Alias(unnamed, "col")() - }), baseCol.plan) + } + val plan = Project(Seq(namedExpr), stripProject(baseCol.plan)) Column(baseCol.sqlContext, plan, expr) } + /** + * Construct a new column based on the expression and the other column value. + * + * There are two cases that can happen here: + * If otherValue is a constant, it is first turned into a Column. + * If otherValue is a Column, then: + * - If this column and otherValue are both computable and come from the same logical plan, + * then we can construct a ComputableColumn by applying a Project on top of the base plan. + * - If this column is not computable, but otherValue is computable, then we can construct + * a ComputableColumn based on otherValue's base plan. + * - If this column is computable, but otherValue is not, then we can construct a + * ComputableColumn based on this column's base plan. + * - If neither columns are computable, then we create an IncomputableColumn. + */ private def constructColumn(otherValue: Any)(newExpr: Column => Expression): Column = { - // Removes all the top level projection and subquery so we can get to the underlying plan. - @tailrec def stripProject(p: LogicalPlan): LogicalPlan = p match { - case Project(_, child) => stripProject(child) - case Subquery(_, child) => stripProject(child) - case _ => p - } - + // lit(otherValue) returns a Column always. 
(this, lit(otherValue)) match { case (left: ComputableColumn, right: ComputableColumn) => if (stripProject(left.plan).sameResult(stripProject(right.plan))) { computableCol(right, newExpr(right)) } else { + // We don't want to throw an exception here because "df1("a") === df2("b")" can be + // a valid expression for join conditions, even though standalone they are not valid. Column(newExpr(right)) } case (left: ComputableColumn, right) => computableCol(left, newExpr(right)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 523911d108029..05ac1623d78ed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -183,14 +183,14 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @group userf */ - implicit def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = { + implicit def rddToDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = { self.createDataFrame(rdd) } /** * Creates a DataFrame from a local Seq of Product. */ - implicit def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = { + implicit def localSeqToDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = { self.createDataFrame(data) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index 1d71039872434..e3e6f652ed3ed 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import org.apache.spark.sql.Dsl._ import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.test.TestSQLContext.implicits._ import org.apache.spark.sql.types.{BooleanType, IntegerType, StructField, StructType} @@ -44,10 +45,10 @@ class ColumnExpressionSuite extends QueryTest { shouldBeComputable(-testData2("a")) shouldBeComputable(!testData2("a")) - shouldBeComputable(testData2.select(($"a" + 1).as("c"))("c") + testData2("b")) - shouldBeComputable( + shouldNotBeComputable(testData2.select(($"a" + 1).as("c"))("c") + testData2("b")) + shouldNotBeComputable( testData2.select(($"a" + 1).as("c"))("c") + testData2.select(($"b" / 2).as("d"))("d")) - shouldBeComputable( + shouldNotBeComputable( testData2.select(($"a" + 1).as("c")).select(($"c" + 2).as("d"))("d") + testData2("b")) // Literals and unresolved columns should not be computable. @@ -66,6 +67,12 @@ class ColumnExpressionSuite extends QueryTest { shouldNotBeComputable(sum(testData2("a"))) } + test("collect on column produced by a binary operator") { + val df = Seq((1, 2, 3)).toDataFrame("a", "b", "c") + checkAnswer(df("a") + df("b"), Seq(Row(3))) + checkAnswer(df("a") + df("b").as("c"), Seq(Row(3))) + } + test("star") { checkAnswer(testData.select($"*"), testData.collect().toSeq) } From 1cb37700753437045b15c457b983532cd5a27fa5 Mon Sep 17 00:00:00 2001 From: mcheah Date: Tue, 10 Feb 2015 20:12:18 -0800 Subject: [PATCH 113/357] [SPARK-4879] Use driver to coordinate Hadoop output committing for speculative tasks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, SparkHadoopWriter always committed its tasks without question. 
The problem is that when speculation is enabled sometimes this can result in multiple tasks committing their output to the same file. Even though an HDFS-writing task may be re-launched due to speculation, the original task is not killed and may eventually commit as well. This can cause strange race conditions where multiple tasks that commit interfere with each other, with the result being that some partition files are actually lost entirely. For more context on these kinds of scenarios, see SPARK-4879. In Hadoop MapReduce jobs, the application master is a central coordinator that authorizes whether or not any given task can commit. Before a task commits its output, it queries the application master as to whether or not such a commit is safe, and the application master does bookkeeping as tasks are requesting commits. Duplicate tasks that would write to files that were already written to from other tasks are prohibited from committing. This patch emulates that functionality - the crucial missing component was a central arbitrator, which is now a module called the OutputCommitCoordinator. The coordinator lives on the driver and the executors can obtain a reference to this actor and request its permission to commit. As tasks commit and are reported as completed successfully or unsuccessfully by the DAGScheduler, the commit coordinator is informed of the task completion events as well to update its internal state. Future work includes more rigorous unit testing and extra optimizations should this patch cause a performance regression. It is unclear what the overall cost of communicating back to the driver on every hadoop-committing task will be. It's also important for those hitting this issue to backport this onto previous version of Spark because the bug has serious consequences, that is, data is lost. Currently, the OutputCommitCoordinator is only used when `spark.speculation` is true. It can be disabled by setting `spark.hadoop.outputCommitCoordination.enabled=false` in SparkConf. This patch is an updated version of #4155 (by mccheah), which in turn was an updated version of this PR. Closes #4155. Author: mcheah Author: Josh Rosen Closes #4066 from JoshRosen/SPARK-4879-sparkhadoopwriter-fix and squashes the following commits: 658116b [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4879-sparkhadoopwriter-fix ed783b2 [Josh Rosen] Address Andrew’s feedback. e7be65a [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4879-sparkhadoopwriter-fix 14861ea [Josh Rosen] splitID -> partitionID in a few places ed8b554 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4879-sparkhadoopwriter-fix 48d5c1c [Josh Rosen] Roll back copiesRunning change in TaskSetManager 3969f5f [Josh Rosen] Re-enable guarding of commit coordination with spark.speculation setting. ede7590 [Josh Rosen] Add test to ensure that a job that denies all commits cannot complete successfully. 97da5fe [Josh Rosen] Use actor only for RPC; call methods directly in DAGScheduler. f582574 [Josh Rosen] Some cleanup in OutputCommitCoordinatorSuite a7c0e29 [Josh Rosen] Create fake TaskInfo using dummy fields instead of Mockito. 997b41b [Josh Rosen] Roll back unnecessary DAGSchedulerSingleThreadedProcessLoop refactoring: 459310a [Josh Rosen] Roll back TaskSetManager changes that broke other tests. dd00b7c [Josh Rosen] Move CommitDeniedException to executors package; remove `@DeveloperAPI` annotation. c79df98 [Josh Rosen] Some misc. 
code style + doc changes: f7d69c5 [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-4879-sparkhadoopwriter-fix 92e6dc9 [Josh Rosen] Bug fix: use task ID instead of StageID to index into authorizedCommitters. b344bad [Josh Rosen] (Temporarily) re-enable “always coordinate” for testing purposes. 0aec91e [Josh Rosen] Only coordinate when speculation is enabled; add configuration option to bypass new coordination. 594e41a [mcheah] Fixing a scalastyle error 60a47f4 [mcheah] Writing proper unit test for OutputCommitCoordinator and fixing bugs. d63f63f [mcheah] Fixing compiler error 9fe6495 [mcheah] Fixing scalastyle 1df2a91 [mcheah] Throwing exception if SparkHadoopWriter commit denied d431144 [mcheah] Using more concurrency to process OutputCommitCoordinator requests. c334255 [mcheah] Properly handling messages that could be sent after actor shutdown. 8d5a091 [mcheah] Was mistakenly serializing the accumulator in test suite. 9c6a4fa [mcheah] More OutputCommitCoordinator cleanup on stop() 78eb1b5 [mcheah] Better OutputCommitCoordinatorActor stopping; simpler canCommit 83de900 [mcheah] Making the OutputCommitCoordinatorMessage serializable abc7db4 [mcheah] TaskInfo can't be null in DAGSchedulerSuite f135a8e [mcheah] Moving the output commit coordinator from class into method. 1c2b219 [mcheah] Renaming oudated names for test function classes 66a71cd [mcheah] Removing whitespace modifications 6b543ba [mcheah] Removing redundant accumulator in unit test c9decc6 [mcheah] Scalastyle fixes bc80770 [mcheah] Unit tests for OutputCommitCoordinator 6e6f748 [mcheah] [SPARK-4879] Use the Spark driver to authorize Hadoop commits. --- .../scala/org/apache/spark/SparkContext.scala | 11 +- .../scala/org/apache/spark/SparkEnv.scala | 22 +- .../org/apache/spark/SparkHadoopWriter.scala | 43 +++- .../org/apache/spark/TaskEndReason.scala | 14 ++ .../executor/CommitDeniedException.scala | 35 +++ .../org/apache/spark/executor/Executor.scala | 5 + .../apache/spark/scheduler/DAGScheduler.scala | 15 +- .../scheduler/OutputCommitCoordinator.scala | 172 ++++++++++++++ .../spark/scheduler/TaskSchedulerImpl.scala | 9 +- .../spark/scheduler/TaskSetManager.scala | 8 +- .../spark/scheduler/DAGSchedulerSuite.scala | 25 +- .../OutputCommitCoordinatorSuite.scala | 213 ++++++++++++++++++ 12 files changed, 549 insertions(+), 23 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala create mode 100644 core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala create mode 100644 core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 53fce6b0defdf..24a316e40e673 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -249,7 +249,16 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli conf.set("spark.executor.id", SparkContext.DRIVER_IDENTIFIER) // Create the Spark execution environment (cache, map output tracker, etc) - private[spark] val env = SparkEnv.createDriverEnv(conf, isLocal, listenerBus) + + // This function allows components created by SparkEnv to be mocked in unit tests: + private[spark] def createSparkEnv( + conf: SparkConf, + isLocal: Boolean, + listenerBus: LiveListenerBus): SparkEnv = { + SparkEnv.createDriverEnv(conf, isLocal, listenerBus) + } + + private[spark] val env 
= createSparkEnv(conf, isLocal, listenerBus) SparkEnv.set(env) // Used to store a URL for each static file/jar together with the file's local timestamp diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index b63bea5b102b6..2a0c7e756dd3a 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -34,7 +34,8 @@ import org.apache.spark.metrics.MetricsSystem import org.apache.spark.network.BlockTransferService import org.apache.spark.network.netty.NettyBlockTransferService import org.apache.spark.network.nio.NioBlockTransferService -import org.apache.spark.scheduler.LiveListenerBus +import org.apache.spark.scheduler.{OutputCommitCoordinator, LiveListenerBus} +import org.apache.spark.scheduler.OutputCommitCoordinator.OutputCommitCoordinatorActor import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.{ShuffleMemoryManager, ShuffleManager} import org.apache.spark.storage._ @@ -67,6 +68,7 @@ class SparkEnv ( val sparkFilesDir: String, val metricsSystem: MetricsSystem, val shuffleMemoryManager: ShuffleMemoryManager, + val outputCommitCoordinator: OutputCommitCoordinator, val conf: SparkConf) extends Logging { private[spark] var isStopped = false @@ -88,6 +90,7 @@ class SparkEnv ( blockManager.stop() blockManager.master.stop() metricsSystem.stop() + outputCommitCoordinator.stop() actorSystem.shutdown() // Unfortunately Akka's awaitTermination doesn't actually wait for the Netty server to shut // down, but let's call it anyway in case it gets fixed in a later release @@ -169,7 +172,8 @@ object SparkEnv extends Logging { private[spark] def createDriverEnv( conf: SparkConf, isLocal: Boolean, - listenerBus: LiveListenerBus): SparkEnv = { + listenerBus: LiveListenerBus, + mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = { assert(conf.contains("spark.driver.host"), "spark.driver.host is not set on the driver!") assert(conf.contains("spark.driver.port"), "spark.driver.port is not set on the driver!") val hostname = conf.get("spark.driver.host") @@ -181,7 +185,8 @@ object SparkEnv extends Logging { port, isDriver = true, isLocal = isLocal, - listenerBus = listenerBus + listenerBus = listenerBus, + mockOutputCommitCoordinator = mockOutputCommitCoordinator ) } @@ -220,7 +225,8 @@ object SparkEnv extends Logging { isDriver: Boolean, isLocal: Boolean, listenerBus: LiveListenerBus = null, - numUsableCores: Int = 0): SparkEnv = { + numUsableCores: Int = 0, + mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = { // Listener bus is only used on the driver if (isDriver) { @@ -368,6 +374,13 @@ object SparkEnv extends Logging { "levels using the RDD.persist() method instead.") } + val outputCommitCoordinator = mockOutputCommitCoordinator.getOrElse { + new OutputCommitCoordinator(conf) + } + val outputCommitCoordinatorActor = registerOrLookup("OutputCommitCoordinator", + new OutputCommitCoordinatorActor(outputCommitCoordinator)) + outputCommitCoordinator.coordinatorActor = Some(outputCommitCoordinatorActor) + val envInstance = new SparkEnv( executorId, actorSystem, @@ -384,6 +397,7 @@ object SparkEnv extends Logging { sparkFilesDir, metricsSystem, shuffleMemoryManager, + outputCommitCoordinator, conf) // Add a reference to tmp dir created by driver, we will delete this tmp dir when stop() is diff --git a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala 
b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala index 40237596570de..6eb4537d10477 100644 --- a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala @@ -26,6 +26,7 @@ import org.apache.hadoop.mapred._ import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path +import org.apache.spark.executor.CommitDeniedException import org.apache.spark.mapred.SparkHadoopMapRedUtil import org.apache.spark.rdd.HadoopRDD @@ -105,24 +106,56 @@ class SparkHadoopWriter(@transient jobConf: JobConf) def commit() { val taCtxt = getTaskContext() val cmtr = getOutputCommitter() - if (cmtr.needsTaskCommit(taCtxt)) { + + // Called after we have decided to commit + def performCommit(): Unit = { try { cmtr.commitTask(taCtxt) - logInfo (taID + ": Committed") + logInfo (s"$taID: Committed") } catch { - case e: IOException => { + case e: IOException => logError("Error committing the output of task: " + taID.value, e) cmtr.abortTask(taCtxt) throw e + } + } + + // First, check whether the task's output has already been committed by some other attempt + if (cmtr.needsTaskCommit(taCtxt)) { + // The task output needs to be committed, but we don't know whether some other task attempt + // might be racing to commit the same output partition. Therefore, coordinate with the driver + // in order to determine whether this attempt can commit (see SPARK-4879). + val shouldCoordinateWithDriver: Boolean = { + val sparkConf = SparkEnv.get.conf + // We only need to coordinate with the driver if there are multiple concurrent task + // attempts, which should only occur if speculation is enabled + val speculationEnabled = sparkConf.getBoolean("spark.speculation", false) + // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs + sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", speculationEnabled) + } + if (shouldCoordinateWithDriver) { + val outputCommitCoordinator = SparkEnv.get.outputCommitCoordinator + val canCommit = outputCommitCoordinator.canCommit(jobID, splitID, attemptID) + if (canCommit) { + performCommit() + } else { + val msg = s"$taID: Not committed because the driver did not authorize commit" + logInfo(msg) + // We need to abort the task so that the driver can reschedule new attempts, if necessary + cmtr.abortTask(taCtxt) + throw new CommitDeniedException(msg, jobID, splitID, attemptID) } + } else { + // Speculation is disabled or a user has chosen to manually bypass the commit coordination + performCommit() } } else { - logInfo ("No need to commit output of task: " + taID.value) + // Some other attempt committed the output, so we do nothing and signal success + logInfo(s"No need to commit output of task because needsTaskCommit=false: ${taID.value}") } } def commitJob() { - // always ? Or if cmtr.needsTaskCommit ? val cmtr = getOutputCommitter() cmtr.commitJob(getJobContext()) } diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index af5fd8e0ac00c..29a5cd5fdac76 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -146,6 +146,20 @@ case object TaskKilled extends TaskFailedReason { override def toErrorString: String = "TaskKilled (killed intentionally)" } +/** + * :: DeveloperApi :: + * Task requested the driver to commit, but was denied. 
+ */ +@DeveloperApi +case class TaskCommitDenied( + jobID: Int, + partitionID: Int, + attemptID: Int) + extends TaskFailedReason { + override def toErrorString: String = s"TaskCommitDenied (Driver denied task commit)" + + s" for job: $jobID, partition: $partitionID, attempt: $attemptID" +} + /** * :: DeveloperApi :: * The task failed because the executor that it was running on was lost. This may happen because diff --git a/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala b/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala new file mode 100644 index 0000000000000..f7604a321f007 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/executor/CommitDeniedException.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.executor + +import org.apache.spark.{TaskCommitDenied, TaskEndReason} + +/** + * Exception thrown when a task attempts to commit output to HDFS but is denied by the driver. + */ +class CommitDeniedException( + msg: String, + jobID: Int, + splitID: Int, + attemptID: Int) + extends Exception(msg) { + + def toTaskEndReason: TaskEndReason = new TaskCommitDenied(jobID, splitID, attemptID) + +} + diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 6b22dcd6f5cbf..b684fb704956b 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -253,6 +253,11 @@ private[spark] class Executor( execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled)) } + case cDE: CommitDeniedException => { + val reason = cDE.toTaskEndReason + execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason)) + } + case t: Throwable => { // Attempt to exit cleanly by informing the driver of our failure. 
// If anything goes wrong (or this was a fatal exception), we will delegate to diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 1cfe98673773a..79035571adb05 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -38,7 +38,7 @@ import org.apache.spark.executor.TaskMetrics import org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.storage._ -import org.apache.spark.util.{CallSite, EventLoop, SystemClock, Clock, Utils} +import org.apache.spark.util._ import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat /** @@ -63,7 +63,7 @@ class DAGScheduler( mapOutputTracker: MapOutputTrackerMaster, blockManagerMaster: BlockManagerMaster, env: SparkEnv, - clock: Clock = SystemClock) + clock: org.apache.spark.util.Clock = SystemClock) extends Logging { def this(sc: SparkContext, taskScheduler: TaskScheduler) = { @@ -126,6 +126,8 @@ class DAGScheduler( private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this) taskScheduler.setDAGScheduler(this) + private val outputCommitCoordinator = env.outputCommitCoordinator + // Called by TaskScheduler to report task's starting. def taskStarted(task: Task[_], taskInfo: TaskInfo) { eventProcessLoop.post(BeginEvent(task, taskInfo)) @@ -808,6 +810,7 @@ class DAGScheduler( // will be posted, which should always come after a corresponding SparkListenerStageSubmitted // event. stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size)) + outputCommitCoordinator.stageStart(stage.id) listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties)) // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times. @@ -865,6 +868,7 @@ class DAGScheduler( } else { // Because we posted SparkListenerStageSubmitted earlier, we should post // SparkListenerStageCompleted here in case there are no tasks to run. + outputCommitCoordinator.stageEnd(stage.id) listenerBus.post(SparkListenerStageCompleted(stage.latestInfo)) logDebug("Stage " + stage + " is actually done; %b %d %d".format( stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions)) @@ -909,6 +913,9 @@ class DAGScheduler( val stageId = task.stageId val taskType = Utils.getFormattedClassName(task) + outputCommitCoordinator.taskCompleted(stageId, task.partitionId, + event.taskInfo.attempt, event.reason) + // The success case is dealt with separately below, since we need to compute accumulator // updates before posting. if (event.reason != Success) { @@ -921,6 +928,7 @@ class DAGScheduler( // Skip all the actions if the stage has been cancelled. 
return } + val stage = stageIdToStage(task.stageId) def markStageAsFinished(stage: Stage, errorMessage: Option[String] = None) = { @@ -1073,6 +1081,9 @@ class DAGScheduler( handleExecutorLost(bmAddress.executorId, fetchFailed = true, Some(task.epoch)) } + case commitDenied: TaskCommitDenied => + // Do nothing here, left up to the TaskScheduler to decide how to handle denied commits + case ExceptionFailure(className, description, stackTrace, fullStackTrace, metrics) => // Do nothing here, left up to the TaskScheduler to decide how to handle user failures diff --git a/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala new file mode 100644 index 0000000000000..759df023a6dcf --- /dev/null +++ b/core/src/main/scala/org/apache/spark/scheduler/OutputCommitCoordinator.scala @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.scheduler + +import scala.collection.mutable + +import akka.actor.{ActorRef, Actor} + +import org.apache.spark._ +import org.apache.spark.util.{AkkaUtils, ActorLogReceive} + +private sealed trait OutputCommitCoordinationMessage extends Serializable + +private case object StopCoordinator extends OutputCommitCoordinationMessage +private case class AskPermissionToCommitOutput(stage: Int, task: Long, taskAttempt: Long) + +/** + * Authority that decides whether tasks can commit output to HDFS. Uses a "first committer wins" + * policy. + * + * OutputCommitCoordinator is instantiated in both the drivers and executors. On executors, it is + * configured with a reference to the driver's OutputCommitCoordinatorActor, so requests to commit + * output will be forwarded to the driver's OutputCommitCoordinator. + * + * This class was introduced in SPARK-4879; see that JIRA issue (and the associated pull requests) + * for an extensive design discussion. + */ +private[spark] class OutputCommitCoordinator(conf: SparkConf) extends Logging { + + // Initialized by SparkEnv + var coordinatorActor: Option[ActorRef] = None + private val timeout = AkkaUtils.askTimeout(conf) + private val maxAttempts = AkkaUtils.numRetries(conf) + private val retryInterval = AkkaUtils.retryWaitMs(conf) + + private type StageId = Int + private type PartitionId = Long + private type TaskAttemptId = Long + + /** + * Map from active stages's id => partition id => task attempt with exclusive lock on committing + * output for that partition. + * + * Entries are added to the top-level map when stages start and are removed they finish + * (either successfully or unsuccessfully). + * + * Access to this map should be guarded by synchronizing on the OutputCommitCoordinator instance. 
+ */ + private val authorizedCommittersByStage: CommittersByStageMap = mutable.Map() + private type CommittersByStageMap = mutable.Map[StageId, mutable.Map[PartitionId, TaskAttemptId]] + + /** + * Called by tasks to ask whether they can commit their output to HDFS. + * + * If a task attempt has been authorized to commit, then all other attempts to commit the same + * task will be denied. If the authorized task attempt fails (e.g. due to its executor being + * lost), then a subsequent task attempt may be authorized to commit its output. + * + * @param stage the stage number + * @param partition the partition number + * @param attempt a unique identifier for this task attempt + * @return true if this task is authorized to commit, false otherwise + */ + def canCommit( + stage: StageId, + partition: PartitionId, + attempt: TaskAttemptId): Boolean = { + val msg = AskPermissionToCommitOutput(stage, partition, attempt) + coordinatorActor match { + case Some(actor) => + AkkaUtils.askWithReply[Boolean](msg, actor, maxAttempts, retryInterval, timeout) + case None => + logError( + "canCommit called after coordinator was stopped (is SparkEnv shutdown in progress)?") + false + } + } + + // Called by DAGScheduler + private[scheduler] def stageStart(stage: StageId): Unit = synchronized { + authorizedCommittersByStage(stage) = mutable.HashMap[PartitionId, TaskAttemptId]() + } + + // Called by DAGScheduler + private[scheduler] def stageEnd(stage: StageId): Unit = synchronized { + authorizedCommittersByStage.remove(stage) + } + + // Called by DAGScheduler + private[scheduler] def taskCompleted( + stage: StageId, + partition: PartitionId, + attempt: TaskAttemptId, + reason: TaskEndReason): Unit = synchronized { + val authorizedCommitters = authorizedCommittersByStage.getOrElse(stage, { + logDebug(s"Ignoring task completion for completed stage") + return + }) + reason match { + case Success => + // The task output has been committed successfully + case denied: TaskCommitDenied => + logInfo( + s"Task was denied committing, stage: $stage, partition: $partition, attempt: $attempt") + case otherReason => + logDebug(s"Authorized committer $attempt (stage=$stage, partition=$partition) failed;" + + s" clearing lock") + authorizedCommitters.remove(partition) + } + } + + def stop(): Unit = synchronized { + coordinatorActor.foreach(_ ! 
StopCoordinator) + coordinatorActor = None + authorizedCommittersByStage.clear() + } + + // Marked private[scheduler] instead of private so this can be mocked in tests + private[scheduler] def handleAskPermissionToCommit( + stage: StageId, + partition: PartitionId, + attempt: TaskAttemptId): Boolean = synchronized { + authorizedCommittersByStage.get(stage) match { + case Some(authorizedCommitters) => + authorizedCommitters.get(partition) match { + case Some(existingCommitter) => + logDebug(s"Denying $attempt to commit for stage=$stage, partition=$partition; " + + s"existingCommitter = $existingCommitter") + false + case None => + logDebug(s"Authorizing $attempt to commit for stage=$stage, partition=$partition") + authorizedCommitters(partition) = attempt + true + } + case None => + logDebug(s"Stage $stage has completed, so not allowing task attempt $attempt to commit") + false + } + } +} + +private[spark] object OutputCommitCoordinator { + + // This actor is used only for RPC + class OutputCommitCoordinatorActor(outputCommitCoordinator: OutputCommitCoordinator) + extends Actor with ActorLogReceive with Logging { + + override def receiveWithLogging = { + case AskPermissionToCommitOutput(stage, partition, taskAttempt) => + sender ! outputCommitCoordinator.handleAskPermissionToCommit(stage, partition, taskAttempt) + case StopCoordinator => + logInfo("OutputCommitCoordinator stopped!") + context.stop(self) + sender ! true + } + } +} diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 79f84e70df9d5..54f8fcfc416d1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -158,7 +158,7 @@ private[spark] class TaskSchedulerImpl( val tasks = taskSet.tasks logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks") this.synchronized { - val manager = new TaskSetManager(this, taskSet, maxTaskFailures) + val manager = createTaskSetManager(taskSet, maxTaskFailures) activeTaskSets(taskSet.id) = manager schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties) @@ -180,6 +180,13 @@ private[spark] class TaskSchedulerImpl( backend.reviveOffers() } + // Label as private[scheduler] to allow tests to swap in different task set managers if necessary + private[scheduler] def createTaskSetManager( + taskSet: TaskSet, + maxTaskFailures: Int): TaskSetManager = { + new TaskSetManager(this, taskSet, maxTaskFailures) + } + override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = synchronized { logInfo("Cancelling stage " + stageId) activeTaskSets.find(_._2.stageId == stageId).foreach { case (_, tsm) => diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 55024ecd55e61..99a5f7117790d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -292,7 +292,8 @@ private[spark] class TaskSetManager( * an attempt running on this host, in case the host is slow. In addition, the task should meet * the given locality constraint. 
*/ - private def dequeueSpeculativeTask(execId: String, host: String, locality: TaskLocality.Value) + // Labeled as protected to allow tests to override providing speculative tasks if necessary + protected def dequeueSpeculativeTask(execId: String, host: String, locality: TaskLocality.Value) : Option[(Int, TaskLocality.Value)] = { speculatableTasks.retain(index => !successful(index)) // Remove finished tasks from set @@ -708,7 +709,10 @@ private[spark] class TaskSetManager( put(info.executorId, clock.getTime()) sched.dagScheduler.taskEnded(tasks(index), reason, null, null, info, taskMetrics) addPendingTask(index) - if (!isZombie && state != TaskState.KILLED) { + if (!isZombie && state != TaskState.KILLED && !reason.isInstanceOf[TaskCommitDenied]) { + // If a task failed because its attempt to commit was denied, do not count this failure + // towards failing the stage. This is intended to prevent spurious stage failures in cases + // where many speculative tasks are launched and denied to commit. assert (null != failureReason) numFailures(index) += 1 if (numFailures(index) >= maxTaskFailures) { diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index eb116213f69fc..9d0c1273695f6 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -208,7 +208,7 @@ class DAGSchedulerSuite extends FunSuiteLike with BeforeAndAfter with LocalSpar assert(taskSet.tasks.size >= results.size) for ((result, i) <- results.zipWithIndex) { if (i < taskSet.tasks.size) { - runEvent(CompletionEvent(taskSet.tasks(i), result._1, result._2, null, null, null)) + runEvent(CompletionEvent(taskSet.tasks(i), result._1, result._2, null, createFakeTaskInfo(), null)) } } } @@ -219,7 +219,7 @@ class DAGSchedulerSuite extends FunSuiteLike with BeforeAndAfter with LocalSpar for ((result, i) <- results.zipWithIndex) { if (i < taskSet.tasks.size) { runEvent(CompletionEvent(taskSet.tasks(i), result._1, result._2, - Map[Long, Any]((accumId, 1)), null, null)) + Map[Long, Any]((accumId, 1)), createFakeTaskInfo(), null)) } } } @@ -476,7 +476,7 @@ class DAGSchedulerSuite extends FunSuiteLike with BeforeAndAfter with LocalSpar FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"), null, Map[Long, Any](), - null, + createFakeTaskInfo(), null)) assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)) assert(sparkListener.failedStages.contains(1)) @@ -487,7 +487,7 @@ class DAGSchedulerSuite extends FunSuiteLike with BeforeAndAfter with LocalSpar FetchFailed(makeBlockManagerId("hostA"), shuffleId, 1, 1, "ignored"), null, Map[Long, Any](), - null, + createFakeTaskInfo(), null)) // The SparkListener should not receive redundant failure events. 
assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)) @@ -507,14 +507,14 @@ class DAGSchedulerSuite extends FunSuiteLike with BeforeAndAfter with LocalSpar assert(newEpoch > oldEpoch) val taskSet = taskSets(0) // should be ignored for being too old - runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostA", 1), null, null, null)) + runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostA", 1), null, createFakeTaskInfo(), null)) // should work because it's a non-failed host - runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostB", 1), null, null, null)) + runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostB", 1), null, createFakeTaskInfo(), null)) // should be ignored for being too old - runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostA", 1), null, null, null)) + runEvent(CompletionEvent(taskSet.tasks(0), Success, makeMapStatus("hostA", 1), null, createFakeTaskInfo(), null)) // should work because it's a new epoch taskSet.tasks(1).epoch = newEpoch - runEvent(CompletionEvent(taskSet.tasks(1), Success, makeMapStatus("hostA", 1), null, null, null)) + runEvent(CompletionEvent(taskSet.tasks(1), Success, makeMapStatus("hostA", 1), null, createFakeTaskInfo(), null)) assert(mapOutputTracker.getServerStatuses(shuffleId, 0).map(_._1) === Array(makeBlockManagerId("hostB"), makeBlockManagerId("hostA"))) complete(taskSets(1), Seq((Success, 42), (Success, 43))) @@ -766,5 +766,14 @@ class DAGSchedulerSuite extends FunSuiteLike with BeforeAndAfter with LocalSpar assert(scheduler.shuffleToMapStage.isEmpty) assert(scheduler.waitingStages.isEmpty) } + + // Nothing in this test should break if the task info's fields are null, but + // OutputCommitCoordinator requires the task info itself to not be null. + private def createFakeTaskInfo(): TaskInfo = { + val info = new TaskInfo(0, 0, 0, 0L, "", "", TaskLocality.ANY, false) + info.finishTime = 1 // to prevent spurious errors in JobProgressListener + info + } + } diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala new file mode 100644 index 0000000000000..3cc860caa1d9b --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.scheduler + +import java.io.File +import java.util.concurrent.TimeoutException + +import org.mockito.Matchers +import org.mockito.Mockito._ +import org.mockito.invocation.InvocationOnMock +import org.mockito.stubbing.Answer +import org.scalatest.{BeforeAndAfter, FunSuite} + +import org.apache.hadoop.mapred.{TaskAttemptID, JobConf, TaskAttemptContext, OutputCommitter} + +import org.apache.spark._ +import org.apache.spark.rdd.{RDD, FakeOutputCommitter} +import org.apache.spark.util.Utils + +import scala.concurrent.Await +import scala.concurrent.duration._ +import scala.language.postfixOps + +/** + * Unit tests for the output commit coordination functionality. + * + * The unit test makes both the original task and the speculated task + * attempt to commit, where committing is emulated by creating a + * directory. If both tasks create directories then the end result is + * a failure. + * + * Note that there are some aspects of this test that are less than ideal. + * In particular, the test mocks the speculation-dequeuing logic to always + * dequeue a task and consider it as speculated. Immediately after initially + * submitting the tasks and calling reviveOffers(), reviveOffers() is invoked + * again to pick up the speculated task. This may be hacking the original + * behavior in too much of an unrealistic fashion. + * + * Also, the validation is done by checking the number of files in a directory. + * Ideally, an accumulator would be used for this, where we could increment + * the accumulator in the output committer's commitTask() call. If the call to + * commitTask() was called twice erroneously then the test would ideally fail because + * the accumulator would be incremented twice. + * + * The problem with this test implementation is that when both a speculated task and + * its original counterpart complete, only one of the accumulator's increments is + * captured. This results in a paradox where if the OutputCommitCoordinator logic + * was not in SparkHadoopWriter, the tests would still pass because only one of the + * increments would be captured even though the commit in both tasks was executed + * erroneously. + */ +class OutputCommitCoordinatorSuite extends FunSuite with BeforeAndAfter { + + var outputCommitCoordinator: OutputCommitCoordinator = null + var tempDir: File = null + var sc: SparkContext = null + + before { + tempDir = Utils.createTempDir() + val conf = new SparkConf() + .setMaster("local[4]") + .setAppName(classOf[OutputCommitCoordinatorSuite].getSimpleName) + .set("spark.speculation", "true") + sc = new SparkContext(conf) { + override private[spark] def createSparkEnv( + conf: SparkConf, + isLocal: Boolean, + listenerBus: LiveListenerBus): SparkEnv = { + outputCommitCoordinator = spy(new OutputCommitCoordinator(conf)) + // Use Mockito.spy() to maintain the default infrastructure everywhere else. + // This mocking allows us to control the coordinator responses in test cases. 
+ SparkEnv.createDriverEnv(conf, isLocal, listenerBus, Some(outputCommitCoordinator)) + } + } + // Use Mockito.spy() to maintain the default infrastructure everywhere else + val mockTaskScheduler = spy(sc.taskScheduler.asInstanceOf[TaskSchedulerImpl]) + + doAnswer(new Answer[Unit]() { + override def answer(invoke: InvocationOnMock): Unit = { + // Submit the tasks, then force the task scheduler to dequeue the + // speculated task + invoke.callRealMethod() + mockTaskScheduler.backend.reviveOffers() + } + }).when(mockTaskScheduler).submitTasks(Matchers.any()) + + doAnswer(new Answer[TaskSetManager]() { + override def answer(invoke: InvocationOnMock): TaskSetManager = { + val taskSet = invoke.getArguments()(0).asInstanceOf[TaskSet] + new TaskSetManager(mockTaskScheduler, taskSet, 4) { + var hasDequeuedSpeculatedTask = false + override def dequeueSpeculativeTask( + execId: String, + host: String, + locality: TaskLocality.Value): Option[(Int, TaskLocality.Value)] = { + if (!hasDequeuedSpeculatedTask) { + hasDequeuedSpeculatedTask = true + Some(0, TaskLocality.PROCESS_LOCAL) + } else { + None + } + } + } + } + }).when(mockTaskScheduler).createTaskSetManager(Matchers.any(), Matchers.any()) + + sc.taskScheduler = mockTaskScheduler + val dagSchedulerWithMockTaskScheduler = new DAGScheduler(sc, mockTaskScheduler) + sc.taskScheduler.setDAGScheduler(dagSchedulerWithMockTaskScheduler) + sc.dagScheduler = dagSchedulerWithMockTaskScheduler + } + + after { + sc.stop() + tempDir.delete() + outputCommitCoordinator = null + } + + test("Only one of two duplicate commit tasks should commit") { + val rdd = sc.parallelize(Seq(1), 1) + sc.runJob(rdd, OutputCommitFunctions(tempDir.getAbsolutePath).commitSuccessfully _, + 0 until rdd.partitions.size, allowLocal = false) + assert(tempDir.list().size === 1) + } + + test("If commit fails, if task is retried it should not be locked, and will succeed.") { + val rdd = sc.parallelize(Seq(1), 1) + sc.runJob(rdd, OutputCommitFunctions(tempDir.getAbsolutePath).failFirstCommitAttempt _, + 0 until rdd.partitions.size, allowLocal = false) + assert(tempDir.list().size === 1) + } + + test("Job should not complete if all commits are denied") { + // Create a mock OutputCommitCoordinator that denies all attempts to commit + doReturn(false).when(outputCommitCoordinator).handleAskPermissionToCommit( + Matchers.any(), Matchers.any(), Matchers.any()) + val rdd: RDD[Int] = sc.parallelize(Seq(1), 1) + def resultHandler(x: Int, y: Unit): Unit = {} + val futureAction: SimpleFutureAction[Unit] = sc.submitJob[Int, Unit, Unit](rdd, + OutputCommitFunctions(tempDir.getAbsolutePath).commitSuccessfully, + 0 until rdd.partitions.size, resultHandler, 0) + // It's an error if the job completes successfully even though no committer was authorized, + // so throw an exception if the job was allowed to complete. + intercept[TimeoutException] { + Await.result(futureAction, 5 seconds) + } + assert(tempDir.list().size === 0) + } +} + +/** + * Class with methods that can be passed to runJob to test commits with a mock committer. 
+ */ +private case class OutputCommitFunctions(tempDirPath: String) { + + // Mock output committer that simulates a successful commit (after commit is authorized) + private def successfulOutputCommitter = new FakeOutputCommitter { + override def commitTask(context: TaskAttemptContext): Unit = { + Utils.createDirectory(tempDirPath) + } + } + + // Mock output committer that simulates a failed commit (after commit is authorized) + private def failingOutputCommitter = new FakeOutputCommitter { + override def commitTask(taskAttemptContext: TaskAttemptContext) { + throw new RuntimeException + } + } + + def commitSuccessfully(iter: Iterator[Int]): Unit = { + val ctx = TaskContext.get() + runCommitWithProvidedCommitter(ctx, iter, successfulOutputCommitter) + } + + def failFirstCommitAttempt(iter: Iterator[Int]): Unit = { + val ctx = TaskContext.get() + runCommitWithProvidedCommitter(ctx, iter, + if (ctx.attemptNumber == 0) failingOutputCommitter else successfulOutputCommitter) + } + + private def runCommitWithProvidedCommitter( + ctx: TaskContext, + iter: Iterator[Int], + outputCommitter: OutputCommitter): Unit = { + def jobConf = new JobConf { + override def getOutputCommitter(): OutputCommitter = outputCommitter + } + val sparkHadoopWriter = new SparkHadoopWriter(jobConf) { + override def newTaskAttemptContext( + conf: JobConf, + attemptId: TaskAttemptID): TaskAttemptContext = { + mock(classOf[TaskAttemptContext]) + } + } + sparkHadoopWriter.setup(ctx.stageId, ctx.partitionId, ctx.attemptNumber) + sparkHadoopWriter.commit() + } +} From b969182659aa7ea94c38329b86d98a31b23efce8 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 10 Feb 2015 20:19:14 -0800 Subject: [PATCH 114/357] [SPARK-5729] Potential NPE in standalone REST API If the user specifies a bad REST URL, the server will throw an NPE instead of propagating the error back. This is because the default `ErrorServlet` has the wrong prefix. This is a one line fix. I am will add more comprehensive tests in a separate patch. Author: Andrew Or Closes #4518 from andrewor14/rest-npe and squashes the following commits: 16b15bc [Andrew Or] Correct ErrorServlet context prefix --- .../org/apache/spark/deploy/rest/StandaloneRestServer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala index 6e4486e20fcba..acd3a2b5abe6c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala @@ -77,7 +77,7 @@ private[spark] class StandaloneRestServer( new SubmitRequestServlet(masterActor, masterUrl, masterConf) -> s"$baseContext/create/*", new KillRequestServlet(masterActor, masterConf) -> s"$baseContext/kill/*", new StatusRequestServlet(masterActor, masterConf) -> s"$baseContext/status/*", - new ErrorServlet -> "/" // default handler + new ErrorServlet -> "/*" // default handler ) /** Start the server and return the bound port. */ From b8f88d32723eaea4807c10b5b79d0c76f30b0510 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 10 Feb 2015 20:40:21 -0800 Subject: [PATCH 115/357] [SPARK-5702][SQL] Allow short names for built-in data sources. Also took the chance to fixed up some style ... Author: Reynold Xin Closes #4489 from rxin/SPARK-5702 and squashes the following commits: 74f42e3 [Reynold Xin] [SPARK-5702][SQL] Allow short names for built-in data sources. 
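As a usage illustration (the path below is hypothetical): any code path that resolves a provider string through ResolvedDataSource (load, save, or a USING clause) can now use the built-in short names, which is exactly what the new ResolvedDataSourceSuite asserts.

```scala
// Mirrors ResolvedDataSourceSuite (lookupDataSource is private[sql], so user code
// would normally just pass the short name to load/save or a USING clause):
ResolvedDataSource.lookupDataSource("json")     // org.apache.spark.sql.json.DefaultSource
ResolvedDataSource.lookupDataSource("parquet")  // org.apache.spark.sql.parquet.DefaultSource

// Hypothetical caller, assuming the load(source, options) overload on SQLContext:
val people = sqlContext.load("json", Map("path" -> "/tmp/people.json"))
// instead of sqlContext.load("org.apache.spark.sql.json", Map("path" -> "/tmp/people.json"))
```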
--- .../apache/spark/sql/jdbc/JDBCRelation.scala | 26 +++---- .../apache/spark/sql/json/JSONRelation.scala | 1 + .../org/apache/spark/sql/sources/ddl.scala | 77 ++++++++++--------- .../sql/sources/ResolvedDataSourceSuite.scala | 34 ++++++++ 4 files changed, 90 insertions(+), 48 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala index 66ad38eb7c45b..beb76f2c553c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JDBCRelation.scala @@ -48,11 +48,6 @@ private[sql] object JDBCRelation { * exactly once. The parameters minValue and maxValue are advisory in that * incorrect values may cause the partitioning to be poor, but no data * will fail to be represented. - * - * @param column - Column name. Must refer to a column of integral type. - * @param numPartitions - Number of partitions - * @param minValue - Smallest value of column. Advisory. - * @param maxValue - Largest value of column. Advisory. */ def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = { if (partitioning == null) return Array[Partition](JDBCPartition(null, 0)) @@ -68,12 +63,17 @@ private[sql] object JDBCRelation { var currentValue: Long = partitioning.lowerBound var ans = new ArrayBuffer[Partition]() while (i < numPartitions) { - val lowerBound = (if (i != 0) s"$column >= $currentValue" else null) + val lowerBound = if (i != 0) s"$column >= $currentValue" else null currentValue += stride - val upperBound = (if (i != numPartitions - 1) s"$column < $currentValue" else null) - val whereClause = (if (upperBound == null) lowerBound - else if (lowerBound == null) upperBound - else s"$lowerBound AND $upperBound") + val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null + val whereClause = + if (upperBound == null) { + lowerBound + } else if (lowerBound == null) { + upperBound + } else { + s"$lowerBound AND $upperBound" + } ans += JDBCPartition(whereClause, i) i = i + 1 } @@ -96,8 +96,7 @@ private[sql] class DefaultSource extends RelationProvider { if (driver != null) Class.forName(driver) - if ( - partitionColumn != null + if (partitionColumn != null && (lowerBound == null || upperBound == null || numPartitions == null)) { sys.error("Partitioning incompletely specified") } @@ -119,7 +118,8 @@ private[sql] class DefaultSource extends RelationProvider { private[sql] case class JDBCRelation( url: String, table: String, - parts: Array[Partition])(@transient val sqlContext: SQLContext) extends PrunedFilteredScan { + parts: Array[Partition])(@transient val sqlContext: SQLContext) + extends PrunedFilteredScan { override val schema = JDBCRDD.resolveTable(url, table) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala index f828bcdd65c9e..51ff2443f3717 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.json import java.io.IOException import org.apache.hadoop.fs.Path + import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index 6487c14b1eb8f..d3d72089c3303 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -234,65 +234,73 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { primitiveType } -object ResolvedDataSource { - def apply( - sqlContext: SQLContext, - userSpecifiedSchema: Option[StructType], - provider: String, - options: Map[String, String]): ResolvedDataSource = { +private[sql] object ResolvedDataSource { + + private val builtinSources = Map( + "jdbc" -> classOf[org.apache.spark.sql.jdbc.DefaultSource], + "json" -> classOf[org.apache.spark.sql.json.DefaultSource], + "parquet" -> classOf[org.apache.spark.sql.parquet.DefaultSource] + ) + + /** Given a provider name, look up the data source class definition. */ + def lookupDataSource(provider: String): Class[_] = { + if (builtinSources.contains(provider)) { + return builtinSources(provider) + } + val loader = Utils.getContextOrSparkClassLoader - val clazz: Class[_] = try loader.loadClass(provider) catch { + try { + loader.loadClass(provider) + } catch { case cnf: java.lang.ClassNotFoundException => - try loader.loadClass(provider + ".DefaultSource") catch { + try { + loader.loadClass(provider + ".DefaultSource") + } catch { case cnf: java.lang.ClassNotFoundException => sys.error(s"Failed to load class for data source: $provider") } } + } + /** Create a [[ResolvedDataSource]] for reading data in. */ + def apply( + sqlContext: SQLContext, + userSpecifiedSchema: Option[StructType], + provider: String, + options: Map[String, String]): ResolvedDataSource = { + val clazz: Class[_] = lookupDataSource(provider) val relation = userSpecifiedSchema match { - case Some(schema: StructType) => { - clazz.newInstance match { - case dataSource: SchemaRelationProvider => - dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options), schema) - case dataSource: org.apache.spark.sql.sources.RelationProvider => - sys.error(s"${clazz.getCanonicalName} does not allow user-specified schemas.") - } + case Some(schema: StructType) => clazz.newInstance() match { + case dataSource: SchemaRelationProvider => + dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options), schema) + case dataSource: org.apache.spark.sql.sources.RelationProvider => + sys.error(s"${clazz.getCanonicalName} does not allow user-specified schemas.") } - case None => { - clazz.newInstance match { - case dataSource: RelationProvider => - dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options)) - case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider => - sys.error(s"A schema needs to be specified when using ${clazz.getCanonicalName}.") - } + + case None => clazz.newInstance() match { + case dataSource: RelationProvider => + dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options)) + case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider => + sys.error(s"A schema needs to be specified when using ${clazz.getCanonicalName}.") } } - new ResolvedDataSource(clazz, relation) } + /** Create a [[ResolvedDataSource]] for saving the content of the given [[DataFrame]]. 
*/ def apply( sqlContext: SQLContext, provider: String, mode: SaveMode, options: Map[String, String], data: DataFrame): ResolvedDataSource = { - val loader = Utils.getContextOrSparkClassLoader - val clazz: Class[_] = try loader.loadClass(provider) catch { - case cnf: java.lang.ClassNotFoundException => - try loader.loadClass(provider + ".DefaultSource") catch { - case cnf: java.lang.ClassNotFoundException => - sys.error(s"Failed to load class for data source: $provider") - } - } - - val relation = clazz.newInstance match { + val clazz: Class[_] = lookupDataSource(provider) + val relation = clazz.newInstance() match { case dataSource: CreatableRelationProvider => dataSource.createRelation(sqlContext, mode, options, data) case _ => sys.error(s"${clazz.getCanonicalName} does not allow create table as select.") } - new ResolvedDataSource(clazz, relation) } } @@ -405,6 +413,5 @@ protected[sql] class CaseInsensitiveMap(map: Map[String, String]) extends Map[St /** * The exception thrown from the DDL parser. - * @param message */ protected[sql] class DDLException(message: String) extends Exception(message) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala new file mode 100644 index 0000000000000..8331a14c9295c --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala @@ -0,0 +1,34 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.sources + +import org.scalatest.FunSuite + +class ResolvedDataSourceSuite extends FunSuite { + + test("builtin sources") { + assert(ResolvedDataSource.lookupDataSource("jdbc") === + classOf[org.apache.spark.sql.jdbc.DefaultSource]) + + assert(ResolvedDataSource.lookupDataSource("json") === + classOf[org.apache.spark.sql.json.DefaultSource]) + + assert(ResolvedDataSource.lookupDataSource("parquet") === + classOf[org.apache.spark.sql.parquet.DefaultSource]) + } +} From f86a89a2e081ee4593ce03398c2283fd77daac6e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 10 Feb 2015 21:51:15 -0800 Subject: [PATCH 116/357] [SPARK-5714][Mllib] Refactor initial step of LDA to remove redundant operations The `initialState` of LDA performs several RDD operations that looks redundant. This pr tries to simplify these operations. Author: Liang-Chi Hsieh Closes #4501 from viirya/sim_lda and squashes the following commits: 4870fe4 [Liang-Chi Hsieh] For comments. 9af1487 [Liang-Chi Hsieh] Refactor initial step of LDA to remove redundant operations. 
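The change replaces the two-pass construction of document and term vertices (an intermediate RDD of per-edge gammas followed by two separate aggregateByKey jobs) with a single flatMap that emits each edge's contribution to both of its endpoints, combined with reduceByKey. A minimal sketch of that pattern with plain RDDs, assuming a simplified TokenEdge type with scalar weights (the real code works on GraphX edges carrying Breeze topic-count vectors):

```
import org.apache.spark.SparkContext._   // pair-RDD operations such as reduceByKey
import org.apache.spark.rdd.RDD

// Hypothetical stand-in for GraphX's Edge[TokenCount]; weight plays the role of gamma * count.
case class TokenEdge(srcId: Long, dstId: Long, weight: Double)

// Emit each edge's contribution to both endpoints once, then combine per vertex.
def vertexSums(edges: RDD[TokenEdge]): RDD[(Long, Double)] =
  edges
    .flatMap(e => Seq((e.srcId, e.weight), (e.dstId, e.weight)))
    .reduceByKey(_ + _)
```

The summed (vertexId, value) pairs can then feed a single Graph constructor, which is what the refactored createVertices in the diff below does.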
--- .../apache/spark/mllib/clustering/LDA.scala | 37 +++++++------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index a1d3df03a1140..5e17c8da61134 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -450,34 +450,23 @@ private[clustering] object LDA { // Create vertices. // Initially, we use random soft assignments of tokens to topics (random gamma). - val edgesWithGamma: RDD[(Edge[TokenCount], TopicCounts)] = - edges.mapPartitionsWithIndex { case (partIndex, partEdges) => - val random = new Random(partIndex + randomSeed) - partEdges.map { edge => - // Create a random gamma_{wjk} - (edge, normalize(BDV.fill[Double](k)(random.nextDouble()), 1.0)) + def createVertices(): RDD[(VertexId, TopicCounts)] = { + val verticesTMP: RDD[(VertexId, TopicCounts)] = + edges.mapPartitionsWithIndex { case (partIndex, partEdges) => + val random = new Random(partIndex + randomSeed) + partEdges.flatMap { edge => + val gamma = normalize(BDV.fill[Double](k)(random.nextDouble()), 1.0) + val sum = gamma * edge.attr + Seq((edge.srcId, sum), (edge.dstId, sum)) + } } - } - def createVertices(sendToWhere: Edge[TokenCount] => VertexId): RDD[(VertexId, TopicCounts)] = { - val verticesTMP: RDD[(VertexId, (TokenCount, TopicCounts))] = - edgesWithGamma.map { case (edge, gamma: TopicCounts) => - (sendToWhere(edge), (edge.attr, gamma)) - } - verticesTMP.aggregateByKey(BDV.zeros[Double](k))( - (sum, t) => { - brzAxpy(t._1, t._2, sum) - sum - }, - (sum0, sum1) => { - sum0 += sum1 - } - ) + verticesTMP.reduceByKey(_ + _) } - val docVertices = createVertices(_.srcId) - val termVertices = createVertices(_.dstId) + + val docTermVertices = createVertices() // Partition such that edges are grouped by document - val graph = Graph(docVertices ++ termVertices, edges) + val graph = Graph(docTermVertices, edges) .partitionBy(PartitionStrategy.EdgePartition1D) new EMOptimizer(graph, k, vocabSize, docConcentration, topicConcentration, checkpointInterval) From 7e2f8821e08fddae661fcb484bf462210ad879e6 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 10 Feb 2015 22:43:32 -0800 Subject: [PATCH 117/357] HOTFIX: Java 6 compilation error in Spark SQL --- .../java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java | 2 +- .../apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java b/sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java index 852baf0e09245..311f1bdd07510 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java +++ b/sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java @@ -87,7 +87,7 @@ public void saveAndLoadWithSchema() { options.put("path", path.toString()); df.save("org.apache.spark.sql.json", SaveMode.ErrorIfExists, options); - List fields = new ArrayList<>(); + List fields = new ArrayList(); fields.add(DataTypes.createStructField("b", DataTypes.StringType, true)); StructType schema = DataTypes.createStructType(fields); DataFrame loadedDF = sqlContext.load("org.apache.spark.sql.json", schema, options); diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java 
b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java index 9744a2aa3f59c..313e84756b6bb 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java @@ -121,7 +121,7 @@ public void saveExternalTableWithSchemaAndQueryIt() { sqlContext.sql("SELECT * FROM javaSavedTable"), df.collectAsList()); - List fields = new ArrayList<>(); + List fields = new ArrayList(); fields.add(DataTypes.createStructField("b", DataTypes.StringType, true)); StructType schema = DataTypes.createStructType(fields); DataFrame loadedDF = From c2131c0cdc57a4871ea23cd71e27e066d3c9a42c Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 10 Feb 2015 23:39:21 -0800 Subject: [PATCH 118/357] HOTFIX: Adding Junit to Hive tests for Maven build --- sql/hive/pom.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 58b0722464be8..72c474d66055c 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -84,6 +84,11 @@ scalacheck_${scala.binary.version} test + + junit + junit + test + From 658687b25491047f30ee8558733d11e5a0572070 Mon Sep 17 00:00:00 2001 From: cody koeninger Date: Wed, 11 Feb 2015 00:13:27 -0800 Subject: [PATCH 119/357] [SPARK-4964] [Streaming] refactor createRDD to take leaders via map instead of array Author: cody koeninger Closes #4511 from koeninger/kafkaRdd-leader-to-broker and squashes the following commits: f7151d4 [cody koeninger] [SPARK-4964] test refactoring 6f8680b [cody koeninger] [SPARK-4964] add test of the scala api for KafkaUtils.createRDD f81e016 [cody koeninger] [SPARK-4964] leave KafkaStreamSuite host and port as private 5173f3f [cody koeninger] [SPARK-4964] test the Java variations of createRDD e9cece4 [cody koeninger] [SPARK-4964] pass leaders as a map to ensure 1 leader per TopicPartition --- .../kafka/{Leader.scala => Broker.scala} | 57 ++++--- .../spark/streaming/kafka/KafkaUtils.scala | 44 +++-- .../streaming/kafka/JavaKafkaRDDSuite.java | 156 ++++++++++++++++++ .../spark/streaming/kafka/KafkaRDDSuite.scala | 96 +++++++---- 4 files changed, 287 insertions(+), 66 deletions(-) rename external/kafka/src/main/scala/org/apache/spark/streaming/kafka/{Leader.scala => Broker.scala} (50%) create mode 100644 external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Leader.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala similarity index 50% rename from external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Leader.scala rename to external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala index c129a26836c0d..5a74febb4bd46 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Leader.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala @@ -17,41 +17,52 @@ package org.apache.spark.streaming.kafka -import kafka.common.TopicAndPartition - import org.apache.spark.annotation.Experimental /** * :: Experimental :: - * Represent the host info for the leader of a Kafka partition. + * Represent the host and port info for a Kafka broker. 
+ * Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID */ @Experimental -final class Leader private( - /** Kafka topic name */ - val topic: String, - /** Kafka partition id */ - val partition: Int, - /** Leader's hostname */ +final class Broker private( + /** Broker's hostname */ val host: String, - /** Leader's port */ - val port: Int) extends Serializable + /** Broker's port */ + val port: Int) extends Serializable { + override def equals(obj: Any): Boolean = obj match { + case that: Broker => + this.host == that.host && + this.port == that.port + case _ => false + } + + override def hashCode: Int = { + 41 * (41 + host.hashCode) + port + } + + override def toString(): String = { + s"Broker($host, $port)" + } +} /** * :: Experimental :: - * Companion object the provides methods to create instances of [[Leader]]. + * Companion object that provides methods to create instances of [[Broker]]. */ @Experimental -object Leader { - def create(topic: String, partition: Int, host: String, port: Int): Leader = - new Leader(topic, partition, host, port) - - def create(topicAndPartition: TopicAndPartition, host: String, port: Int): Leader = - new Leader(topicAndPartition.topic, topicAndPartition.partition, host, port) - - def apply(topic: String, partition: Int, host: String, port: Int): Leader = - new Leader(topic, partition, host, port) +object Broker { + def create(host: String, port: Int): Broker = + new Broker(host, port) - def apply(topicAndPartition: TopicAndPartition, host: String, port: Int): Leader = - new Leader(topicAndPartition.topic, topicAndPartition.partition, host, port) + def apply(host: String, port: Int): Broker = + new Broker(host, port) + def unapply(broker: Broker): Option[(String, Int)] = { + if (broker == null) { + None + } else { + Some((broker.host, broker.port)) + } + } } diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index 7a2c3abdcc24b..af04bc6576148 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -154,6 +154,19 @@ object KafkaUtils { jssc.ssc, kafkaParams.toMap, Map(topics.mapValues(_.intValue()).toSeq: _*), storageLevel) } + /** get leaders for the given offset ranges, or throw an exception */ + private def leadersForRanges( + kafkaParams: Map[String, String], + offsetRanges: Array[OffsetRange]): Map[TopicAndPartition, (String, Int)] = { + val kc = new KafkaCluster(kafkaParams) + val topics = offsetRanges.map(o => TopicAndPartition(o.topic, o.partition)).toSet + val leaders = kc.findLeaders(topics).fold( + errs => throw new SparkException(errs.mkString("\n")), + ok => ok + ) + leaders + } + /** * Create a RDD from Kafka using offset ranges for each topic and partition. 
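// Usage sketch of the updated batch API (hypothetical broker address, topic, and offsets;
// assumes an existing SparkContext `sc`): the new overload takes per-partition leaders as a
// Map[TopicAndPartition, Broker], and passing an empty map makes the driver look the leaders
// up itself.
//
//   val kafkaParams  = Map("metadata.broker.list" -> "broker-1:9092")
//   val offsetRanges = Array(OffsetRange("events", 0, 0L, 100L))
//   val leaders      = Map(TopicAndPartition("events", 0) -> Broker("broker-1", 9092))
//
//   val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder, String](
//     sc, kafkaParams, offsetRanges, leaders,
//     (mmd: MessageAndMetadata[String, String]) => mmd.message())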
* @@ -176,12 +189,7 @@ object KafkaUtils { offsetRanges: Array[OffsetRange] ): RDD[(K, V)] = { val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message) - val kc = new KafkaCluster(kafkaParams) - val topics = offsetRanges.map(o => TopicAndPartition(o.topic, o.partition)).toSet - val leaders = kc.findLeaders(topics).fold( - errs => throw new SparkException(errs.mkString("\n")), - ok => ok - ) + val leaders = leadersForRanges(kafkaParams, offsetRanges) new KafkaRDD[K, V, KD, VD, (K, V)](sc, kafkaParams, offsetRanges, leaders, messageHandler) } @@ -198,7 +206,8 @@ object KafkaUtils { * host1:port1,host2:port2 form. * @param offsetRanges Each OffsetRange in the batch corresponds to a * range of offsets for a given Kafka topic/partition - * @param leaders Kafka leaders for each offset range in batch + * @param leaders Kafka brokers for each TopicAndPartition in offsetRanges. May be an empty map, + * in which case leaders will be looked up on the driver. * @param messageHandler Function for translating each message and metadata into the desired type */ @Experimental @@ -211,12 +220,17 @@ object KafkaUtils { sc: SparkContext, kafkaParams: Map[String, String], offsetRanges: Array[OffsetRange], - leaders: Array[Leader], + leaders: Map[TopicAndPartition, Broker], messageHandler: MessageAndMetadata[K, V] => R ): RDD[R] = { - val leaderMap = leaders - .map(l => TopicAndPartition(l.topic, l.partition) -> (l.host, l.port)) - .toMap + val leaderMap = if (leaders.isEmpty) { + leadersForRanges(kafkaParams, offsetRanges) + } else { + // This could be avoided by refactoring KafkaRDD.leaders and KafkaCluster to use Broker + leaders.map { + case (tp: TopicAndPartition, Broker(host, port)) => (tp, (host, port)) + }.toMap + } new KafkaRDD[K, V, KD, VD, R](sc, kafkaParams, offsetRanges, leaderMap, messageHandler) } @@ -263,7 +277,8 @@ object KafkaUtils { * host1:port1,host2:port2 form. * @param offsetRanges Each OffsetRange in the batch corresponds to a * range of offsets for a given Kafka topic/partition - * @param leaders Kafka leaders for each offset range in batch + * @param leaders Kafka brokers for each TopicAndPartition in offsetRanges. May be an empty map, + * in which case leaders will be looked up on the driver. 
* @param messageHandler Function for translating each message and metadata into the desired type */ @Experimental @@ -276,7 +291,7 @@ object KafkaUtils { recordClass: Class[R], kafkaParams: JMap[String, String], offsetRanges: Array[OffsetRange], - leaders: Array[Leader], + leaders: JMap[TopicAndPartition, Broker], messageHandler: JFunction[MessageAndMetadata[K, V], R] ): JavaRDD[R] = { implicit val keyCmt: ClassTag[K] = ClassTag(keyClass) @@ -284,8 +299,9 @@ object KafkaUtils { implicit val keyDecoderCmt: ClassTag[KD] = ClassTag(keyDecoderClass) implicit val valueDecoderCmt: ClassTag[VD] = ClassTag(valueDecoderClass) implicit val recordCmt: ClassTag[R] = ClassTag(recordClass) + val leaderMap = Map(leaders.toSeq: _*) createRDD[K, V, KD, VD, R]( - jsc.sc, Map(kafkaParams.toSeq: _*), offsetRanges, leaders, messageHandler.call _) + jsc.sc, Map(kafkaParams.toSeq: _*), offsetRanges, leaderMap, messageHandler.call _) } /** diff --git a/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java new file mode 100644 index 0000000000000..9d2e1705c6c73 --- /dev/null +++ b/external/kafka/src/test/java/org/apache/spark/streaming/kafka/JavaKafkaRDDSuite.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Arrays; + +import org.apache.spark.SparkConf; + +import scala.Tuple2; + +import junit.framework.Assert; + +import kafka.common.TopicAndPartition; +import kafka.message.MessageAndMetadata; +import kafka.serializer.StringDecoder; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; + +import org.junit.Test; +import org.junit.After; +import org.junit.Before; + +public class JavaKafkaRDDSuite implements Serializable { + private transient JavaSparkContext sc = null; + private transient KafkaStreamSuiteBase suiteBase = null; + + @Before + public void setUp() { + suiteBase = new KafkaStreamSuiteBase() { }; + suiteBase.setupKafka(); + System.clearProperty("spark.driver.port"); + SparkConf sparkConf = new SparkConf() + .setMaster("local[4]").setAppName(this.getClass().getSimpleName()); + sc = new JavaSparkContext(sparkConf); + } + + @After + public void tearDown() { + sc.stop(); + sc = null; + System.clearProperty("spark.driver.port"); + suiteBase.tearDownKafka(); + } + + @Test + public void testKafkaRDD() throws InterruptedException { + String topic1 = "topic1"; + String topic2 = "topic2"; + + String[] topic1data = createTopicAndSendData(topic1); + String[] topic2data = createTopicAndSendData(topic2); + + HashMap kafkaParams = new HashMap(); + kafkaParams.put("metadata.broker.list", suiteBase.brokerAddress()); + + OffsetRange[] offsetRanges = { + OffsetRange.create(topic1, 0, 0, 1), + OffsetRange.create(topic2, 0, 0, 1) + }; + + HashMap emptyLeaders = new HashMap(); + HashMap leaders = new HashMap(); + String[] hostAndPort = suiteBase.brokerAddress().split(":"); + Broker broker = Broker.create(hostAndPort[0], Integer.parseInt(hostAndPort[1])); + leaders.put(new TopicAndPartition(topic1, 0), broker); + leaders.put(new TopicAndPartition(topic2, 0), broker); + + JavaRDD rdd1 = KafkaUtils.createRDD( + sc, + String.class, + String.class, + StringDecoder.class, + StringDecoder.class, + kafkaParams, + offsetRanges + ).map( + new Function, String>() { + @Override + public String call(scala.Tuple2 kv) throws Exception { + return kv._2(); + } + } + ); + + JavaRDD rdd2 = KafkaUtils.createRDD( + sc, + String.class, + String.class, + StringDecoder.class, + StringDecoder.class, + String.class, + kafkaParams, + offsetRanges, + emptyLeaders, + new Function, String>() { + @Override + public String call(MessageAndMetadata msgAndMd) throws Exception { + return msgAndMd.message(); + } + } + ); + + JavaRDD rdd3 = KafkaUtils.createRDD( + sc, + String.class, + String.class, + StringDecoder.class, + StringDecoder.class, + String.class, + kafkaParams, + offsetRanges, + leaders, + new Function, String>() { + @Override + public String call(MessageAndMetadata msgAndMd) throws Exception { + return msgAndMd.message(); + } + } + ); + + // just making sure the java user apis work; the scala tests handle logic corner cases + long count1 = rdd1.count(); + long count2 = rdd2.count(); + long count3 = rdd3.count(); + Assert.assertTrue(count1 > 0); + Assert.assertEquals(count1, count2); + Assert.assertEquals(count1, count3); + } + + private String[] createTopicAndSendData(String topic) { + String[] data = { topic + "-1", topic + "-2", topic + "-3"}; + suiteBase.createTopic(topic); + suiteBase.sendMessages(topic, data); + return data; + } +} diff --git 
a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala index 6774db854a0d0..a223da70b043f 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaRDDSuite.scala @@ -21,18 +21,22 @@ import scala.util.Random import kafka.serializer.StringDecoder import kafka.common.TopicAndPartition -import org.scalatest.BeforeAndAfter +import kafka.message.MessageAndMetadata +import org.scalatest.BeforeAndAfterAll import org.apache.spark._ import org.apache.spark.SparkContext._ -class KafkaRDDSuite extends KafkaStreamSuiteBase with BeforeAndAfter { +class KafkaRDDSuite extends KafkaStreamSuiteBase with BeforeAndAfterAll { + val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName) var sc: SparkContext = _ - before { + override def beforeAll { + sc = new SparkContext(sparkConf) + setupKafka() } - after { + override def afterAll { if (sc != null) { sc.stop sc = null @@ -40,60 +44,94 @@ class KafkaRDDSuite extends KafkaStreamSuiteBase with BeforeAndAfter { tearDownKafka() } - test("Kafka RDD") { - val sparkConf = new SparkConf().setMaster("local[4]").setAppName(this.getClass.getSimpleName) - sc = new SparkContext(sparkConf) + test("basic usage") { + val topic = "topicbasic" + createTopic(topic) + val messages = Set("the", "quick", "brown", "fox") + sendMessages(topic, messages.toArray) + + + val kafkaParams = Map("metadata.broker.list" -> brokerAddress, + "group.id" -> s"test-consumer-${Random.nextInt(10000)}") + + val offsetRanges = Array(OffsetRange(topic, 0, 0, messages.size)) + + val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder]( + sc, kafkaParams, offsetRanges) + + val received = rdd.map(_._2).collect.toSet + assert(received === messages) + } + + test("iterator boundary conditions") { + // the idea is to find e.g. 
off-by-one errors between what kafka has available and the rdd val topic = "topic1" val sent = Map("a" -> 5, "b" -> 3, "c" -> 10) createTopic(topic) - sendMessages(topic, sent) val kafkaParams = Map("metadata.broker.list" -> brokerAddress, "group.id" -> s"test-consumer-${Random.nextInt(10000)}") val kc = new KafkaCluster(kafkaParams) - val rdd = getRdd(kc, Set(topic)) // this is the "lots of messages" case - // make sure we get all of them + sendMessages(topic, sent) + // rdd defined from leaders after sending messages, should get the number sent + val rdd = getRdd(kc, Set(topic)) + assert(rdd.isDefined) - assert(rdd.get.count === sent.values.sum) + assert(rdd.get.count === sent.values.sum, "didn't get all sent messages") - kc.setConsumerOffsets( - kafkaParams("group.id"), - rdd.get.offsetRanges.map(o => TopicAndPartition(o.topic, o.partition) -> o.untilOffset).toMap) + val ranges = rdd.get.asInstanceOf[HasOffsetRanges] + .offsetRanges.map(o => TopicAndPartition(o.topic, o.partition) -> o.untilOffset).toMap + + kc.setConsumerOffsets(kafkaParams("group.id"), ranges) - val rdd2 = getRdd(kc, Set(topic)) - val sent2 = Map("d" -> 1) - sendMessages(topic, sent2) // this is the "0 messages" case - // make sure we dont get anything, since messages were sent after rdd was defined + val rdd2 = getRdd(kc, Set(topic)) + // shouldn't get anything, since message is sent after rdd was defined + val sentOnlyOne = Map("d" -> 1) + + sendMessages(topic, sentOnlyOne) assert(rdd2.isDefined) - assert(rdd2.get.count === 0) + assert(rdd2.get.count === 0, "got messages when there shouldn't be any") + // this is the "exactly 1 message" case, namely the single message from sentOnlyOne above val rdd3 = getRdd(kc, Set(topic)) + // send lots of messages after rdd was defined, they shouldn't show up sendMessages(topic, Map("extra" -> 22)) - // this is the "exactly 1 message" case - // make sure we get exactly one message, despite there being lots more available + assert(rdd3.isDefined) - assert(rdd3.get.count === sent2.values.sum) + assert(rdd3.get.count === sentOnlyOne.values.sum, "didn't get exactly one message") } // get an rdd from the committed consumer offsets until the latest leader offsets, private def getRdd(kc: KafkaCluster, topics: Set[String]) = { val groupId = kc.kafkaParams("group.id") - for { - topicPartitions <- kc.getPartitions(topics).right.toOption - from <- kc.getConsumerOffsets(groupId, topicPartitions).right.toOption.orElse( + def consumerOffsets(topicPartitions: Set[TopicAndPartition]) = { + kc.getConsumerOffsets(groupId, topicPartitions).right.toOption.orElse( kc.getEarliestLeaderOffsets(topicPartitions).right.toOption.map { offs => offs.map(kv => kv._1 -> kv._2.offset) } ) - until <- kc.getLatestLeaderOffsets(topicPartitions).right.toOption - } yield { - KafkaRDD[String, String, StringDecoder, StringDecoder, String]( - sc, kc.kafkaParams, from, until, mmd => s"${mmd.offset} ${mmd.message}") + } + kc.getPartitions(topics).right.toOption.flatMap { topicPartitions => + consumerOffsets(topicPartitions).flatMap { from => + kc.getLatestLeaderOffsets(topicPartitions).right.toOption.map { until => + val offsetRanges = from.map { case (tp: TopicAndPartition, fromOffset: Long) => + OffsetRange(tp.topic, tp.partition, fromOffset, until(tp).offset) + }.toArray + + val leaders = until.map { case (tp: TopicAndPartition, lo: KafkaCluster.LeaderOffset) => + tp -> Broker(lo.host, lo.port) + }.toMap + + KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder, String]( + sc, kc.kafkaParams, offsetRanges, 
leaders, + (mmd: MessageAndMetadata[String, String]) => s"${mmd.offset} ${mmd.message}") + } + } } } } From da89720bf4023392436e75b6ed5e10ed8588a132 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 11 Feb 2015 08:13:51 +0000 Subject: [PATCH 120/357] SPARK-5728 [STREAMING] MQTTStreamSuite leaves behind ActiveMQ database files Use temp dir for ActiveMQ database Author: Sean Owen Closes #4517 from srowen/SPARK-5728 and squashes the following commits: 1d3aeb8 [Sean Owen] Use temp dir for ActiveMQ database --- .../scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala index e84adc088a680..19c9271af77be 100644 --- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala @@ -93,6 +93,7 @@ class MQTTStreamSuite extends FunSuite with Eventually with BeforeAndAfter { private def setupMQTT() { broker = new BrokerService() + broker.setDataDirectoryFile(Utils.createTempDir()) connector = new TransportConnector() connector.setName("mqtt") connector.setUri(new URI("mqtt:" + brokerUri)) From bd0d6e0cc3a329c4a1c08451a6d8a9281a422958 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 11 Feb 2015 08:30:16 +0000 Subject: [PATCH 121/357] SPARK-5727 [BUILD] Deprecate Debian packaging This just adds a deprecation message. It's intended for backporting to branch 1.3 but can go in master too, to be followed by another PR that removes it for 1.4. Author: Sean Owen Closes #4516 from srowen/SPARK-5727.1 and squashes the following commits: d48989f [Sean Owen] Refer to Spark 1.4 6c1c8b3 [Sean Owen] Deprecate Debian packaging --- assembly/pom.xml | 18 ++++++++++++++++++ docs/building-spark.md | 2 ++ 2 files changed, 20 insertions(+) diff --git a/assembly/pom.xml b/assembly/pom.xml index 301ff69c2ae3b..c1bcdbb664dd0 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -221,6 +221,24 @@ deb + + maven-antrun-plugin + + + prepare-package + + run + + + + + NOTE: Debian packaging is deprecated and is scheduled to be removed in Spark 1.4. + + + + + + org.codehaus.mojo buildnumber-maven-plugin diff --git a/docs/building-spark.md b/docs/building-spark.md index db69905813817..d3824fb61eef9 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -161,6 +161,8 @@ For help in setting up IntelliJ IDEA or Eclipse for Spark development, and troub # Building Spark Debian Packages +_NOTE: Debian packaging is deprecated and is scheduled to be removed in Spark 1.4._ + The Maven build includes support for building a Debian package containing the assembly 'fat-jar', PySpark, and the necessary scripts and configuration files. This can be created by specifying the following: mvn -Pdeb -DskipTests clean package From 1ac099e3e00ddb01af8e6e3a84c70f8363f04b5c Mon Sep 17 00:00:00 2001 From: guliangliang Date: Wed, 11 Feb 2015 15:55:49 +0000 Subject: [PATCH 122/357] [SPARK-5733] Error Link in Pagination of HistroyPage when showing Incomplete Applications The links in pagination of HistroyPage is wrong when showing Incomplete Applications. If "2" is click on the following page "http://history-server:18080/?page=1&showIncomplete=true", it will go to "http://history-server:18080/?page=2" instead of "http://history-server:18080/?page=2&showIncomplete=true". 
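The underlying requirement is that pagination links preserve the showIncomplete query parameter when moving between pages. A minimal sketch of link construction that keeps the flag (an illustrative helper, not the patch's exact markup):

```
// Illustrative helper: carry the showIncomplete flag into every pagination link.
def makePageLink(page: Int, showIncomplete: Boolean): String =
  s"/?page=$page&showIncomplete=$showIncomplete"

makePageLink(2, showIncomplete = true)   // "/?page=2&showIncomplete=true"
```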
Author: guliangliang Closes #4523 from marsishandsome/Spark5733 and squashes the following commits: 9d7b593 [guliangliang] [SPARK-5733] Error Link in Pagination of HistroyPage when showing Incomplete Applications --- .../org/apache/spark/deploy/history/HistoryPage.scala | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index e4e7bc2216014..26ebc75971c66 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -61,9 +61,10 @@ private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") { // page, `...` will be displayed. if (allApps.size > 0) { val leftSideIndices = - rangeIndices(actualPage - plusOrMinus until actualPage, 1 < _) + rangeIndices(actualPage - plusOrMinus until actualPage, 1 < _, requestedIncomplete) val rightSideIndices = - rangeIndices(actualPage + 1 to actualPage + plusOrMinus, _ < pageCount) + rangeIndices(actualPage + 1 to actualPage + plusOrMinus, _ < pageCount, + requestedIncomplete)

    Showing {actualFirst + 1}-{last + 1} of {allApps.size} @@ -122,8 +123,10 @@ private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") { "Spark User", "Last Updated") - private def rangeIndices(range: Seq[Int], condition: Int => Boolean): Seq[Node] = { - range.filter(condition).map(nextPage => {nextPage} ) + private def rangeIndices(range: Seq[Int], condition: Int => Boolean, showIncomplete: Boolean): + Seq[Node] = { + range.filter(condition).map(nextPage => + {nextPage} ) } private def appRow(info: ApplicationHistoryInfo): Seq[Node] = { From b694eb9c2fefeaa33891d3e61f9bea369bc09984 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 11 Feb 2015 12:13:16 -0800 Subject: [PATCH 123/357] [SPARK-5677] [SPARK-5734] [SQL] [PySpark] Python DataFrame API remaining tasks 1. DataFrame.renameColumn 2. DataFrame.show() and _repr_ 3. Use simpleString() rather than jsonValue in DataFrame.dtypes 4. createDataFrame from local Python data, including pandas.DataFrame Author: Davies Liu Closes #4528 from davies/df3 and squashes the following commits: 014acea [Davies Liu] fix typo 6ba526e [Davies Liu] fix tests 46f5f95 [Davies Liu] address comments 6cbc154 [Davies Liu] dataframe.show() and improve dtypes 6f94f25 [Davies Liu] create DataFrame from local Python data --- python/pyspark/sql/context.py | 114 +++++++++++------- python/pyspark/sql/dataframe.py | 42 ++++++- python/pyspark/sql/tests.py | 2 +- python/pyspark/sql/types.py | 32 +++++ .../org/apache/spark/sql/DataFrameImpl.scala | 15 ++- 5 files changed, 155 insertions(+), 50 deletions(-) diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 9d29ef4839a43..db4bcbece2c1b 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -23,12 +23,18 @@ from py4j.protocol import Py4JError from py4j.java_collections import MapConverter -from pyspark.rdd import _prepare_for_python_RDD +from pyspark.rdd import RDD, _prepare_for_python_RDD from pyspark.serializers import AutoBatchedSerializer, PickleSerializer -from pyspark.sql.types import StringType, StructType, _infer_type, _verify_type, \ +from pyspark.sql.types import StringType, StructType, _verify_type, \ _infer_schema, _has_nulltype, _merge_type, _create_converter, _python_to_sql_converter from pyspark.sql.dataframe import DataFrame +try: + import pandas + has_pandas = True +except ImportError: + has_pandas = False + __all__ = ["SQLContext", "HiveContext"] @@ -116,6 +122,31 @@ def registerFunction(self, name, f, returnType=StringType()): self._sc._javaAccumulator, returnType.json()) + def _inferSchema(self, rdd, samplingRatio=None): + first = rdd.first() + if not first: + raise ValueError("The first row in RDD is empty, " + "can not infer schema") + if type(first) is dict: + warnings.warn("Using RDD of dict to inferSchema is deprecated," + "please use pyspark.sql.Row instead") + + if samplingRatio is None: + schema = _infer_schema(first) + if _has_nulltype(schema): + for row in rdd.take(100)[1:]: + schema = _merge_type(schema, _infer_schema(row)) + if not _has_nulltype(schema): + break + else: + raise ValueError("Some of types cannot be determined by the " + "first 100 rows, please try again with sampling") + else: + if samplingRatio < 0.99: + rdd = rdd.sample(False, float(samplingRatio)) + schema = rdd.map(_infer_schema).reduce(_merge_type) + return schema + def inferSchema(self, rdd, samplingRatio=None): """Infer and apply a schema to an RDD of L{Row}. 
@@ -171,29 +202,7 @@ def inferSchema(self, rdd, samplingRatio=None): if isinstance(rdd, DataFrame): raise TypeError("Cannot apply schema to DataFrame") - first = rdd.first() - if not first: - raise ValueError("The first row in RDD is empty, " - "can not infer schema") - if type(first) is dict: - warnings.warn("Using RDD of dict to inferSchema is deprecated," - "please use pyspark.sql.Row instead") - - if samplingRatio is None: - schema = _infer_schema(first) - if _has_nulltype(schema): - for row in rdd.take(100)[1:]: - schema = _merge_type(schema, _infer_schema(row)) - if not _has_nulltype(schema): - break - else: - warnings.warn("Some of types cannot be determined by the " - "first 100 rows, please try again with sampling") - else: - if samplingRatio < 0.99: - rdd = rdd.sample(False, float(samplingRatio)) - schema = rdd.map(_infer_schema).reduce(_merge_type) - + schema = self._inferSchema(rdd, samplingRatio) converter = _create_converter(schema) rdd = rdd.map(converter) return self.applySchema(rdd, schema) @@ -274,7 +283,7 @@ def applySchema(self, rdd, schema): raise TypeError("Cannot apply schema to DataFrame") if not isinstance(schema, StructType): - raise TypeError("schema should be StructType") + raise TypeError("schema should be StructType, but got %s" % schema) # take the first few rows to verify schema rows = rdd.take(10) @@ -294,9 +303,9 @@ def applySchema(self, rdd, schema): df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) return DataFrame(df, self) - def createDataFrame(self, rdd, schema=None, samplingRatio=None): + def createDataFrame(self, data, schema=None, samplingRatio=None): """ - Create a DataFrame from an RDD of tuple/list and an optional `schema`. + Create a DataFrame from an RDD of tuple/list, list or pandas.DataFrame. `schema` could be :class:`StructType` or a list of column names. @@ -311,12 +320,20 @@ def createDataFrame(self, rdd, schema=None, samplingRatio=None): rows will be used to do referring. The first row will be used if `samplingRatio` is None. 
- :param rdd: an RDD of Row or tuple or list or dict + :param data: an RDD of Row/tuple/list/dict, list, or pandas.DataFrame :param schema: a StructType or list of names of columns :param samplingRatio: the sample ratio of rows used for inferring :return: a DataFrame - >>> rdd = sc.parallelize([('Alice', 1)]) + >>> l = [('Alice', 1)] + >>> sqlCtx.createDataFrame(l, ['name', 'age']).collect() + [Row(name=u'Alice', age=1)] + + >>> d = [{'name': 'Alice', 'age': 1}] + >>> sqlCtx.createDataFrame(d).collect() + [Row(age=1, name=u'Alice')] + + >>> rdd = sc.parallelize(l) >>> df = sqlCtx.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] @@ -336,19 +353,32 @@ def createDataFrame(self, rdd, schema=None, samplingRatio=None): >>> df3.collect() [Row(name=u'Alice', age=1)] """ - if isinstance(rdd, DataFrame): - raise TypeError("rdd is already a DataFrame") + if isinstance(data, DataFrame): + raise TypeError("data is already a DataFrame") - if isinstance(schema, StructType): - return self.applySchema(rdd, schema) - else: - if isinstance(schema, (list, tuple)): - first = rdd.first() - if not isinstance(first, (list, tuple)): - raise ValueError("each row in `rdd` should be list or tuple") - row_cls = Row(*schema) - rdd = rdd.map(lambda r: row_cls(*r)) - return self.inferSchema(rdd, samplingRatio) + if has_pandas and isinstance(data, pandas.DataFrame): + data = self._sc.parallelize(data.to_records(index=False)) + if schema is None: + schema = list(data.columns) + + if not isinstance(data, RDD): + try: + # data could be list, tuple, generator ... + data = self._sc.parallelize(data) + except Exception: + raise ValueError("cannot create an RDD from type: %s" % type(data)) + + if schema is None: + return self.inferSchema(data, samplingRatio) + + if isinstance(schema, (list, tuple)): + first = data.first() + if not isinstance(first, (list, tuple)): + raise ValueError("each row in `rdd` should be list or tuple") + row_cls = Row(*schema) + schema = self._inferSchema(data.map(lambda r: row_cls(*r)), samplingRatio) + + return self.applySchema(data, schema) def registerRDDAsTable(self, rdd, tableName): """Registers the given RDD as a temporary table in the catalog. diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 3eef0cc376a2d..3eb56ed74cc6f 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -236,6 +236,24 @@ def printSchema(self): """ print (self._jdf.schema().treeString()) + def show(self): + """ + Print the first 20 rows. + + >>> df.show() + age name + 2 Alice + 5 Bob + >>> df + age name + 2 Alice + 5 Bob + """ + print (self) + + def __repr__(self): + return self._jdf.showString() + def count(self): """Return the number of elements in this RDD. @@ -380,9 +398,9 @@ def dtypes(self): """Return all column names and their data types as a list. 
>>> df.dtypes - [('age', 'integer'), ('name', 'string')] + [('age', 'int'), ('name', 'string')] """ - return [(str(f.name), f.dataType.jsonValue()) for f in self.schema().fields] + return [(str(f.name), f.dataType.simpleString()) for f in self.schema().fields] @property def columns(self): @@ -606,6 +624,17 @@ def addColumn(self, colName, col): """ return self.select('*', col.alias(colName)) + def renameColumn(self, existing, new): + """ Rename an existing column to a new name + + >>> df.renameColumn('age', 'age2').collect() + [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')] + """ + cols = [Column(_to_java_column(c), self.sql_ctx).alias(new) + if c == existing else c + for c in self.columns] + return self.select(*cols) + def to_pandas(self): """ Collect all the rows and return a `pandas.DataFrame`. @@ -885,6 +914,12 @@ def cast(self, dataType): jc = self._jc.cast(jdt) return Column(jc, self.sql_ctx) + def __repr__(self): + if self._jdf.isComputable(): + return self._jdf.samples() + else: + return 'Column<%s>' % self._jdf.toString() + def to_pandas(self): """ Return a pandas.Series from the column @@ -1030,7 +1065,8 @@ def _test(): globs['df'] = sqlCtx.inferSchema(rdd2) globs['df2'] = sqlCtx.inferSchema(rdd3) (failure_count, test_count) = doctest.testmod( - pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS) + pyspark.sql.dataframe, globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 5e41e36897b5d..43e5c3a1b00fa 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -194,7 +194,7 @@ def test_infer_schema(self): result = self.sqlCtx.sql("SELECT l[0].a from test where d['key'].d = '2'") self.assertEqual(1, result.head()[0]) - df2 = self.sqlCtx.createDataFrame(rdd, 1.0) + df2 = self.sqlCtx.createDataFrame(rdd, samplingRatio=1.0) self.assertEqual(df.schema(), df2.schema()) self.assertEqual({}, df2.map(lambda r: r.d).first()) self.assertEqual([None, ""], df2.map(lambda r: r.s).collect()) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 41afefe48ee5e..40bd7e54a9d7b 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -52,6 +52,9 @@ def __ne__(self, other): def typeName(cls): return cls.__name__[:-4].lower() + def simpleString(self): + return self.typeName() + def jsonValue(self): return self.typeName() @@ -145,6 +148,12 @@ def __init__(self, precision=None, scale=None): self.scale = scale self.hasPrecisionInfo = precision is not None + def simpleString(self): + if self.hasPrecisionInfo: + return "decimal(%d,%d)" % (self.precision, self.scale) + else: + return "decimal(10,0)" + def jsonValue(self): if self.hasPrecisionInfo: return "decimal(%d,%d)" % (self.precision, self.scale) @@ -180,6 +189,8 @@ class ByteType(PrimitiveType): The data type representing int values with 1 singed byte. """ + def simpleString(self): + return 'tinyint' class IntegerType(PrimitiveType): @@ -188,6 +199,8 @@ class IntegerType(PrimitiveType): The data type representing int values. """ + def simpleString(self): + return 'int' class LongType(PrimitiveType): @@ -198,6 +211,8 @@ class LongType(PrimitiveType): beyond the range of [-9223372036854775808, 9223372036854775807], please use DecimalType. """ + def simpleString(self): + return 'bigint' class ShortType(PrimitiveType): @@ -206,6 +221,8 @@ class ShortType(PrimitiveType): The data type representing int values with 2 signed bytes. 
""" + def simpleString(self): + return 'smallint' class ArrayType(DataType): @@ -233,6 +250,9 @@ def __init__(self, elementType, containsNull=True): self.elementType = elementType self.containsNull = containsNull + def simpleString(self): + return 'array<%s>' % self.elementType.simpleString() + def __repr__(self): return "ArrayType(%s,%s)" % (self.elementType, str(self.containsNull).lower()) @@ -283,6 +303,9 @@ def __init__(self, keyType, valueType, valueContainsNull=True): self.valueType = valueType self.valueContainsNull = valueContainsNull + def simpleString(self): + return 'map<%s,%s>' % (self.keyType.simpleString(), self.valueType.simpleString()) + def __repr__(self): return "MapType(%s,%s,%s)" % (self.keyType, self.valueType, str(self.valueContainsNull).lower()) @@ -337,6 +360,9 @@ def __init__(self, name, dataType, nullable=True, metadata=None): self.nullable = nullable self.metadata = metadata or {} + def simpleString(self): + return '%s:%s' % (self.name, self.dataType.simpleString()) + def __repr__(self): return "StructField(%s,%s,%s)" % (self.name, self.dataType, str(self.nullable).lower()) @@ -379,6 +405,9 @@ def __init__(self, fields): """ self.fields = fields + def simpleString(self): + return 'struct<%s>' % (','.join(f.simpleString() for f in self.fields)) + def __repr__(self): return ("StructType(List(%s))" % ",".join(str(field) for field in self.fields)) @@ -435,6 +464,9 @@ def deserialize(self, datum): """ raise NotImplementedError("UDT must implement deserialize().") + def simpleString(self): + return 'null' + def json(self): return json.dumps(self.jsonValue(), separators=(',', ':'), sort_keys=True) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 9638ce0865db0..41da4424ae459 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -126,7 +126,10 @@ private[sql] class DataFrameImpl protected[sql]( logicalPlan.isInstanceOf[LocalRelation] } - override def show(): Unit = { + /** + * Internal API for Python + */ + private[sql] def showString(): String = { val data = take(20) val numCols = schema.fieldNames.length @@ -146,12 +149,16 @@ private[sql] class DataFrameImpl protected[sql]( } } - // Pad the cells and print them - println(rows.map { row => + // Pad the cells + rows.map { row => row.zipWithIndex.map { case (cell, i) => String.format(s"%-${colWidths(i)}s", cell) }.mkString(" ") - }.mkString("\n")) + }.mkString("\n") + } + + override def show(): Unit = { + println(showString) } override def join(right: DataFrame): DataFrame = { From 03bf704bf442ac7dd960795295b51957ce972491 Mon Sep 17 00:00:00 2001 From: Daniel Darabos Date: Wed, 11 Feb 2015 20:24:17 +0000 Subject: [PATCH 124/357] Remove outdated remark about take(n). Looking at the code, I believe this remark about `take(n)` computing partitions on the driver is no longer correct. Apologies if I'm wrong. This came up in http://stackoverflow.com/q/28436559/3318517. Author: Daniel Darabos Closes #4533 from darabos/patch-2 and squashes the following commits: cc80f3a [Daniel Darabos] Remove outdated remark about take(n). --- docs/programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/programming-guide.md b/docs/programming-guide.md index 6b365e83fb56d..118701549a759 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -975,7 +975,7 @@ for details. 
take(n) - Return an array with the first n elements of the dataset. Note that this is currently not executed in parallel. Instead, the driver program computes all the elements. + Return an array with the first n elements of the dataset. takeSample(withReplacement, num, [seed]) From a60d2b70adff3a8fb3bdfac226b1d86fdb443da4 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Wed, 11 Feb 2015 12:31:56 -0800 Subject: [PATCH 125/357] [SPARK-5454] More robust handling of self joins Also I fix a bunch of bad output in test cases. Author: Michael Armbrust Closes #4520 from marmbrus/selfJoin and squashes the following commits: 4f4a85c [Michael Armbrust] comments 49c8e26 [Michael Armbrust] fix tests 6fc38de [Michael Armbrust] fix style 55d64b3 [Michael Armbrust] fix dataframe selfjoins --- .../sql/catalyst/analysis/Analyzer.scala | 26 ++++++++++++++++--- .../analysis/MultiInstanceRelation.scala | 21 +-------------- .../spark/sql/catalyst/plans/PlanTest.scala | 4 +-- .../org/apache/spark/sql/SQLContext.scala | 2 ++ sql/core/src/test/resources/log4j.properties | 3 +++ .../org/apache/spark/sql/DataFrameSuite.scala | 10 +++++++ .../spark/sql/catalyst/plans/PlanTest.scala | 4 +-- 7 files changed, 40 insertions(+), 30 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 3f0d77ad6322a..2d1fa106a2aa9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -53,14 +53,11 @@ class Analyzer(catalog: Catalog, val extendedRules: Seq[Rule[LogicalPlan]] = Nil lazy val batches: Seq[Batch] = Seq( - Batch("MultiInstanceRelations", Once, - NewRelationInstances), Batch("Resolution", fixedPoint, - ResolveReferences :: ResolveRelations :: + ResolveReferences :: ResolveGroupingAnalytics :: ResolveSortReferences :: - NewRelationInstances :: ImplicitGenerate :: ResolveFunctions :: GlobalAggregates :: @@ -285,6 +282,27 @@ class Analyzer(catalog: Catalog, } ) + // Special handling for cases when self-join introduce duplicate expression ids. + case j @ Join(left, right, _, _) if left.outputSet.intersect(right.outputSet).nonEmpty => + val conflictingAttributes = left.outputSet.intersect(right.outputSet) + + val (oldRelation, newRelation, attributeRewrites) = right.collect { + case oldVersion: MultiInstanceRelation + if oldVersion.outputSet.intersect(conflictingAttributes).nonEmpty => + val newVersion = oldVersion.newInstance() + val newAttributes = AttributeMap(oldVersion.output.zip(newVersion.output)) + (oldVersion, newVersion, newAttributes) + }.head // Only handle first case found, others will be fixed on the next pass. 
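// For context, the kind of plan that reaches this rewrite is a DataFrame self-join, where both
// sides initially share the same expression ids (sketch based on the test added in this patch):
//
//   val df1 = testData.select(testData("key")).as('df1)
//   val df2 = testData.select(testData("key")).as('df2)
//   df1.join(df2, $"df1.key" === $"df2.key")
//
// The right-hand side is re-instantiated with fresh expression ids so the join condition can
// tell the two copies of `key` apart.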
+ + val newRight = right transformUp { + case r if r == oldRelation => newRelation + case other => other transformExpressions { + case a: Attribute => attributeRewrites.get(a).getOrElse(a) + } + } + + j.copy(right = newRight) + case q: LogicalPlan => logTrace(s"Attempting to resolve ${q.simpleString}") q transformExpressionsUp { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala index 4c5fb3f45bf49..894c3500cf533 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/MultiInstanceRelation.scala @@ -26,28 +26,9 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan * produced by distinct operators in a query tree as this breaks the guarantee that expression * ids, which are used to differentiate attributes, are unique. * - * Before analysis, all operators that include this trait will be asked to produce a new version + * During analysis, operators that include this trait may be asked to produce a new version * of itself with globally unique expression ids. */ trait MultiInstanceRelation { def newInstance(): this.type } - -/** - * If any MultiInstanceRelation appears more than once in the query plan then the plan is updated so - * that each instance has unique expression ids for the attributes produced. - */ -object NewRelationInstances extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = { - val localRelations = plan collect { case l: MultiInstanceRelation => l} - val multiAppearance = localRelations - .groupBy(identity[MultiInstanceRelation]) - .filter { case (_, ls) => ls.size > 1 } - .map(_._1) - .toSet - - plan transform { - case l: MultiInstanceRelation if multiAppearance.contains(l) => l.newInstance() - } - } -} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index c4a1f899d8a13..7d609b91389c6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -33,11 +33,9 @@ class PlanTest extends FunSuite { * we must normalize them to check if two different queries are identical. 
*/ protected def normalizeExprIds(plan: LogicalPlan) = { - val list = plan.flatMap(_.expressions.flatMap(_.references).map(_.exprId.id)) - val minId = if (list.isEmpty) 0 else list.min plan transformAllExpressions { case a: AttributeReference => - AttributeReference(a.name, a.dataType, a.nullable)(exprId = ExprId(a.exprId.id - minId)) + AttributeReference(a.name, a.dataType, a.nullable)(exprId = ExprId(0)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 05ac1623d78ed..fd121ce05698c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -122,6 +122,7 @@ class SQLContext(@transient val sparkContext: SparkContext) case _ => } + @transient protected[sql] val cacheManager = new CacheManager(this) /** @@ -159,6 +160,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * DataTypes.StringType); * }}} */ + @transient val udf: UDFRegistration = new UDFRegistration(this) /** Returns true if the table is currently cached in-memory. */ diff --git a/sql/core/src/test/resources/log4j.properties b/sql/core/src/test/resources/log4j.properties index fbed0a782dd3e..28e90b9520b2c 100644 --- a/sql/core/src/test/resources/log4j.properties +++ b/sql/core/src/test/resources/log4j.properties @@ -39,6 +39,9 @@ log4j.appender.FA.Threshold = INFO log4j.additivity.parquet.hadoop.ParquetRecordReader=false log4j.logger.parquet.hadoop.ParquetRecordReader=OFF +log4j.additivity.parquet.hadoop.ParquetOutputCommitter=false +log4j.logger.parquet.hadoop.ParquetOutputCommitter=OFF + log4j.additivity.org.apache.hadoop.hive.serde2.lazy.LazyStruct=false log4j.logger.org.apache.hadoop.hive.serde2.lazy.LazyStruct=OFF diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 02623f73c7f76..7be9215a443f0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext.logicalPlanToSparkQuery import org.apache.spark.sql.test.TestSQLContext.implicits._ +import org.apache.spark.sql.test.TestSQLContext.sql class DataFrameSuite extends QueryTest { @@ -88,6 +89,15 @@ class DataFrameSuite extends QueryTest { testData.collect().toSeq) } + test("self join") { + val df1 = testData.select(testData("key")).as('df1) + val df2 = testData.select(testData("key")).as('df2) + + checkAnswer( + df1.join(df2, $"df1.key" === $"df2.key"), + sql("SELECT a.key, b.key FROM testData a JOIN testData b ON a.key = b.key").collect().toSeq) + } + test("selectExpr") { checkAnswer( testData.selectExpr("abs(key)", "value"), diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index 081d94b6fc020..44ee5ab5975fb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -35,11 +35,9 @@ class PlanTest extends FunSuite { * we must normalize them to check if two different queries are identical. 
*/ protected def normalizeExprIds(plan: LogicalPlan) = { - val list = plan.flatMap(_.expressions.flatMap(_.references).map(_.exprId.id)) - val minId = if (list.isEmpty) 0 else list.min plan transformAllExpressions { case a: AttributeReference => - AttributeReference(a.name, a.dataType, a.nullable)(exprId = ExprId(a.exprId.id - minId)) + AttributeReference(a.name, a.dataType, a.nullable)(exprId = ExprId(0)) } } From 44b2311d946981c8251cb7807d70c8e99db5bbed Mon Sep 17 00:00:00 2001 From: tianyi Date: Wed, 11 Feb 2015 12:50:17 -0800 Subject: [PATCH 126/357] [SPARK-3688][SQL]LogicalPlan can't resolve column correctlly This PR fixed the resolving problem described in https://issues.apache.org/jira/browse/SPARK-3688 ``` CREATE TABLE t1(x INT); CREATE TABLE t2(a STRUCT, k INT); SELECT a.x FROM t1 a JOIN t2 b ON a.x = b.k; ``` Author: tianyi Closes #4524 from tianyi/SPARK-3688 and squashes the following commits: 237a256 [tianyi] resolve a name with table.column pattern first. --- .../catalyst/plans/logical/LogicalPlan.scala | 50 ++++++++++++------- ...as hive-0-c6d02549aec166e16bfc44d5905fa33a | 0 ...as hive-1-a8987ff8c7b9ca95bf8b32314694ed1f | 0 ...as hive-2-26f54240cf5b909086fc34a34d7fdb56 | 0 ...as hive-3-d08d5280027adea681001ad82a5a6974 | 0 ...as hive-4-22eb25b5be6daf72a6649adfe5041749 | 1 + .../hive/execution/HiveResolutionSuite.scala | 9 ++++ 7 files changed, 42 insertions(+), 18 deletions(-) create mode 100644 sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-0-c6d02549aec166e16bfc44d5905fa33a create mode 100644 sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-1-a8987ff8c7b9ca95bf8b32314694ed1f create mode 100644 sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-2-26f54240cf5b909086fc34a34d7fdb56 create mode 100644 sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-3-d08d5280027adea681001ad82a5a6974 create mode 100644 sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-4-22eb25b5be6daf72a6649adfe5041749 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 7cf4b81274906..b23f8d03df574 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -128,6 +128,29 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { def resolve(name: String, resolver: Resolver): Option[NamedExpression] = resolve(name, output, resolver) + def resolveAsTableColumn( + nameParts: Array[String], + resolver: Resolver, + attribute: Attribute): List[(Attribute, List[String])] = { + if (attribute.qualifiers.find(resolver(_, nameParts.head)).nonEmpty && nameParts.size > 1) { + val remainingParts = nameParts.drop(1) + resolveAsColumn(remainingParts, resolver, attribute) + } else { + Nil + } + } + + def resolveAsColumn( + nameParts: Array[String], + resolver: Resolver, + attribute: Attribute): List[(Attribute, List[String])] = { + if (resolver(attribute.name, nameParts.head)) { + (attribute.withName(nameParts.head), nameParts.tail.toList) :: Nil + } else { + Nil + } + } + /** Performs attribute resolution given a name and a sequence of possible attributes. 
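   * For example, a reference `a.x` is split into parts `["a", "x"]` and the `table.column`
   * interpretation is attempted first: an input attribute qualified by `a` and named `x` wins
   * over a column literally named `a` with a struct field `x`. Only if no such match exists
   * does resolution fall back to treating the first part as a column name (illustrative
   * walk-through, using the SPARK-3688 example from the commit message above).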
*/ protected def resolve( name: String, @@ -136,24 +159,15 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { val parts = name.split("\\.") - // Collect all attributes that are output by this nodes children where either the first part - // matches the name or where the first part matches the scope and the second part matches the - // name. Return these matches along with any remaining parts, which represent dotted access to - // struct fields. - val options = input.flatMap { option => - // If the first part of the desired name matches a qualifier for this possible match, drop it. - val remainingParts = - if (option.qualifiers.find(resolver(_, parts.head)).nonEmpty && parts.size > 1) { - parts.drop(1) - } else { - parts - } - - if (resolver(option.name, remainingParts.head)) { - // Preserve the case of the user's attribute reference. - (option.withName(remainingParts.head), remainingParts.tail.toList) :: Nil - } else { - Nil + // We will try to resolve this name as `table.column` pattern first. + var options = input.flatMap { option => + resolveAsTableColumn(parts, resolver, option) + } + + // If none of attributes match `table.column` pattern, we try to resolve it as a column. + if(options.isEmpty) { + options = input.flatMap { option => + resolveAsColumn(parts, resolver, option) } } diff --git a/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-0-c6d02549aec166e16bfc44d5905fa33a b/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-0-c6d02549aec166e16bfc44d5905fa33a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-1-a8987ff8c7b9ca95bf8b32314694ed1f b/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-1-a8987ff8c7b9ca95bf8b32314694ed1f new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-2-26f54240cf5b909086fc34a34d7fdb56 b/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-2-26f54240cf5b909086fc34a34d7fdb56 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-3-d08d5280027adea681001ad82a5a6974 b/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-3-d08d5280027adea681001ad82a5a6974 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-4-22eb25b5be6daf72a6649adfe5041749 b/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-4-22eb25b5be6daf72a6649adfe5041749 new file mode 100644 index 0000000000000..d00491fd7e5bb --- /dev/null +++ b/sql/hive/src/test/resources/golden/test ambiguousReferences resolved as hive-4-22eb25b5be6daf72a6649adfe5041749 @@ -0,0 +1 @@ +1 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala index ab5f9cdddf508..029c36aa89b26 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala @@ -99,6 +99,15 @@ class HiveResolutionSuite extends HiveComparisonTest { assert(sql("SELECT nestedArray[0].a FROM nestedRepeatedTest").collect().head(0) === 1) } + createQueryTest("test 
ambiguousReferences resolved as hive", + """ + |CREATE TABLE t1(x INT); + |CREATE TABLE t2(a STRUCT, k INT); + |INSERT OVERWRITE TABLE t1 SELECT 1 FROM src LIMIT 1; + |INSERT OVERWRITE TABLE t2 SELECT named_struct("x",1),1 FROM src LIMIT 1; + |SELECT a.x FROM t1 a JOIN t2 b ON a.x = b.k; + """.stripMargin) + /** * Negative examples. Currently only left here for documentation purposes. * TODO(marmbrus): Test that catalyst fails on these queries. From fa6bdc6e819f9338248b952ec578bcd791ddbf6d Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 11 Feb 2015 15:26:31 -0800 Subject: [PATCH 127/357] [SPARK-3688][SQL] More inline comments for LogicalPlan. As a follow-up to https://github.com/apache/spark/pull/4524 Author: Reynold Xin Closes #4539 from rxin/SPARK-3688 and squashes the following commits: 5ac56c7 [Reynold Xin] exists da8eea4 [Reynold Xin] [SPARK-3688][SQL] More inline comments for LogicalPlan. --- .../spark/sql/catalyst/analysis/package.scala | 4 +- .../expressions/namedExpressions.scala | 11 ++ .../spark/sql/catalyst/plans/QueryPlan.scala | 5 + .../catalyst/plans/logical/LogicalPlan.scala | 102 +++++++++++------- .../catalyst/plans/logical/Statistics.scala | 35 ++++++ 5 files changed, 115 insertions(+), 42 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala index 3f672a3e0fd91..5dc9d0e566087 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala @@ -25,8 +25,8 @@ package org.apache.spark.sql.catalyst package object analysis { /** - * Responsible for resolving which identifiers refer to the same entity. For example, by using - * case insensitive equality. + * Resolver should return true if the first string refers to the same entity as the second string. + * For example, by using case insensitive equality. */ type Resolver = (String, String) => Boolean diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 7f122e9d55734..f77c56311cc8c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -40,6 +40,17 @@ abstract class NamedExpression extends Expression { def name: String def exprId: ExprId + + /** + * All possible qualifiers for the expression. + * + * For now, since we do not allow using original table name to qualify a column name once the + * table is aliased, this can only be: + * + * 1. Empty Seq: when an attribute doesn't have a qualifier, + * e.g. top level attributes aliased in the SELECT clause, or column from a LocalRelation. + * 2. Single element: either the table name or the alias name of the table. 
+ */ def qualifiers: Seq[String] def toAttribute: Attribute diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 619f42859cbb8..17a88e07de15f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -152,6 +152,11 @@ abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanTy /** Prints out the schema in the tree format */ def printSchema(): Unit = println(schemaString) + /** + * A prefix string used when printing the plan. + * + * We use "!" to indicate an invalid plan, and "'" to indicate an unresolved plan. + */ protected def statePrefix = if (missingInput.nonEmpty && children.nonEmpty) "!" else "" override def simpleString = statePrefix + super.simpleString diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index b23f8d03df574..8c4f09b58a4f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -18,41 +18,29 @@ package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.Logging +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{UnresolvedGetField, Resolver} -import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.trees.TreeNode -import org.apache.spark.sql.types.StructType import org.apache.spark.sql.catalyst.trees -/** - * Estimates of various statistics. The default estimation logic simply lazily multiplies the - * corresponding statistic produced by the children. To override this behavior, override - * `statistics` and assign it an overridden version of `Statistics`. - * - * '''NOTE''': concrete and/or overridden versions of statistics fields should pay attention to the - * performance of the implementations. The reason is that estimations might get triggered in - * performance-critical processes, such as query plan planning. - * - * Note that we are using a BigInt here since it is easy to overflow a 64-bit integer in - * cardinality estimation (e.g. cartesian joins). - * - * @param sizeInBytes Physical size in bytes. For leaf operators this defaults to 1, otherwise it - * defaults to the product of children's `sizeInBytes`. - */ -private[sql] case class Statistics(sizeInBytes: BigInt) abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { self: Product => + /** + * Computes [[Statistics]] for this plan. The default implementation assumes the output + * cardinality is the product of of all child plan's cardinality, i.e. applies in the case + * of cartesian joins. + * + * [[LeafNode]]s must override this. 
+ */ def statistics: Statistics = { if (children.size == 0) { throw new UnsupportedOperationException(s"LeafNode $nodeName must implement statistics.") } - - Statistics( - sizeInBytes = children.map(_.statistics).map(_.sizeInBytes).product) + Statistics(sizeInBytes = children.map(_.statistics.sizeInBytes).product) } /** @@ -128,26 +116,41 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { def resolve(name: String, resolver: Resolver): Option[NamedExpression] = resolve(name, output, resolver) - def resolveAsTableColumn( + /** + * Resolve the given `name` string against the given attribute, returning either 0 or 1 match. + * + * This assumes `name` has multiple parts, where the 1st part is a qualifier + * (i.e. table name, alias, or subquery alias). + * See the comment above `candidates` variable in resolve() for semantics the returned data. + */ + private def resolveAsTableColumn( nameParts: Array[String], resolver: Resolver, - attribute: Attribute): List[(Attribute, List[String])] = { - if (attribute.qualifiers.find(resolver(_, nameParts.head)).nonEmpty && nameParts.size > 1) { - val remainingParts = nameParts.drop(1) + attribute: Attribute): Option[(Attribute, List[String])] = { + assert(nameParts.length > 1) + if (attribute.qualifiers.exists(resolver(_, nameParts.head))) { + // At least one qualifier matches. See if remaining parts match. + val remainingParts = nameParts.tail resolveAsColumn(remainingParts, resolver, attribute) } else { - Nil + None } } - def resolveAsColumn( + /** + * Resolve the given `name` string against the given attribute, returning either 0 or 1 match. + * + * Different from resolveAsTableColumn, this assumes `name` does NOT start with a qualifier. + * See the comment above `candidates` variable in resolve() for semantics the returned data. + */ + private def resolveAsColumn( nameParts: Array[String], resolver: Resolver, - attribute: Attribute): List[(Attribute, List[String])] = { + attribute: Attribute): Option[(Attribute, List[String])] = { if (resolver(attribute.name, nameParts.head)) { - (attribute.withName(nameParts.head), nameParts.tail.toList) :: Nil + Option((attribute.withName(nameParts.head), nameParts.tail.toList)) } else { - Nil + None } } @@ -159,25 +162,44 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { val parts = name.split("\\.") - // We will try to resolve this name as `table.column` pattern first. - var options = input.flatMap { option => - resolveAsTableColumn(parts, resolver, option) + // A sequence of possible candidate matches. + // Each candidate is a tuple. The first element is a resolved attribute, followed by a list + // of parts that are to be resolved. + // For example, consider an example where "a" is the table name, "b" is the column name, + // and "c" is the struct field name, i.e. "a.b.c". In this case, Attribute will be "a.b", + // and the second element will be List("c"). + var candidates: Seq[(Attribute, List[String])] = { + // If the name has 2 or more parts, try to resolve it as `table.column` first. + if (parts.length > 1) { + input.flatMap { option => + resolveAsTableColumn(parts, resolver, option) + } + } else { + Seq.empty + } } // If none of attributes match `table.column` pattern, we try to resolve it as a column. 
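The two-pass resolution above (qualifier-first, then bare column) is what fixes the SPARK-3688 query quoted in the previous commit message. A rough, hedged sketch of the scenario, assuming a Hive-enabled build and assuming the struct column t2.a carries an INT field x (the DDL in that message presumably reads STRUCT<x: INT>):

```
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object Spark3688Sketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("spark-3688-sketch").setMaster("local"))
    val hive = new HiveContext(sc)

    hive.sql("CREATE TABLE IF NOT EXISTS t1 (x INT)")
    // Assumption: the struct column has an INT field named x.
    hive.sql("CREATE TABLE IF NOT EXISTS t2 (a STRUCT<x: INT>, k INT)")

    // "a.x" can be read as (1) column x of the alias a, i.e. t1.x, or
    // (2) field x of t2's struct column named a. Trying the table.column
    // pattern first makes reading (1) win, which matches Hive's behaviour.
    hive.sql("SELECT a.x FROM t1 a JOIN t2 b ON a.x = b.k").collect()

    sc.stop()
  }
}
```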
- if(options.isEmpty) { - options = input.flatMap { option => - resolveAsColumn(parts, resolver, option) + if (candidates.isEmpty) { + candidates = input.flatMap { candidate => + resolveAsColumn(parts, resolver, candidate) } } - options.distinct match { + candidates.distinct match { // One match, no nested fields, use it. case Seq((a, Nil)) => Some(a) // One match, but we also need to extract the requested nested field. case Seq((a, nestedFields)) => - Some(Alias(nestedFields.foldLeft(a: Expression)(UnresolvedGetField), nestedFields.last)()) + // The foldLeft adds UnresolvedGetField for every remaining parts of the name, + // and aliased it with the last part of the name. + // For example, consider name "a.b.c", where "a" is resolved to an existing attribute. + // Then this will add UnresolvedGetField("b") and UnresolvedGetField("c"), and alias + // the final expression as "c". + val fieldExprs = nestedFields.foldLeft(a: Expression)(UnresolvedGetField) + val aliasName = nestedFields.last + Some(Alias(fieldExprs, aliasName)()) // No matches. case Seq() => @@ -186,8 +208,8 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging { // More than one match. case ambiguousReferences => - throw new TreeNodeException( - this, s"Ambiguous references to $name: ${ambiguousReferences.mkString(",")}") + throw new AnalysisException( + s"Ambiguous references to $name: ${ambiguousReferences.mkString(",")}") } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala new file mode 100644 index 0000000000000..9ac4c3a2a56c8 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +/** + * Estimates of various statistics. The default estimation logic simply lazily multiplies the + * corresponding statistic produced by the children. To override this behavior, override + * `statistics` and assign it an overridden version of `Statistics`. + * + * '''NOTE''': concrete and/or overridden versions of statistics fields should pay attention to the + * performance of the implementations. The reason is that estimations might get triggered in + * performance-critical processes, such as query plan planning. + * + * Note that we are using a BigInt here since it is easy to overflow a 64-bit integer in + * cardinality estimation (e.g. cartesian joins). + * + * @param sizeInBytes Physical size in bytes. For leaf operators this defaults to 1, otherwise it + * defaults to the product of children's `sizeInBytes`. 
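To make the default above concrete: an operator with two children estimated at 1000 and 2000 bytes is assigned 1000 * 2000 = 2,000,000 bytes, a bound that is only tight for a cartesian join; that is why leaf operators must supply a real estimate instead of inheriting this one. A trivial sketch of the arithmetic, with made-up sizes:

```
// Made-up child estimates, just to spell out the product rule.
val childSizes = Seq(BigInt(1000), BigInt(2000))
val defaultSizeInBytes = childSizes.product
assert(defaultSizeInBytes == BigInt(2000000))
```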
+ */ +private[sql] case class Statistics(sizeInBytes: BigInt) From d931b01dcaaf009dcf68dcfe83428bd7f9e857cc Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 11 Feb 2015 18:32:48 -0800 Subject: [PATCH 128/357] [SQL] Two DataFrame fixes. - Removed DataFrame.apply for projection & filtering since they are extremely confusing. - Added implicits for RDD[Int], RDD[Long], and RDD[String] Author: Reynold Xin Closes #4543 from rxin/df-cleanup and squashes the following commits: 81ec915 [Reynold Xin] [SQL] More DataFrame fixes. --- .../org/apache/spark/sql/DataFrame.scala | 39 ++++--------- .../org/apache/spark/sql/DataFrameImpl.scala | 24 +++----- .../apache/spark/sql/IncomputableColumn.scala | 4 -- .../org/apache/spark/sql/SQLContext.scala | 54 +++++++++++++++--- .../spark/sql/DataFrameImplicitsSuite.scala | 55 +++++++++++++++++++ 5 files changed, 119 insertions(+), 57 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 17900c5ee3892..327cf87f30f4b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -85,6 +85,14 @@ trait DataFrame extends RDDApi[Row] { protected[sql] def logicalPlan: LogicalPlan + override def toString = + try { + schema.map(f => s"${f.name}: ${f.dataType.simpleString}").mkString("[", ", ", "]") + } catch { + case NonFatal(e) => + s"Invalid tree; ${e.getMessage}:\n$queryExecution" + } + /** Left here for backward compatibility. */ @deprecated("1.3.0", "use toDataFrame") def toSchemaRDD: DataFrame = this @@ -92,13 +100,9 @@ trait DataFrame extends RDDApi[Row] { /** * Returns the object itself. Used to force an implicit conversion from RDD to DataFrame in Scala. */ - def toDataFrame: DataFrame = this - - override def toString = - try schema.map(f => s"${f.name}: ${f.dataType.simpleString}").mkString("[", ", ", "]") catch { - case NonFatal(e) => - s"Invalid tree; ${e.getMessage}:\n$queryExecution" - } + // This is declared with parentheses to prevent the Scala compiler from treating + // `rdd.toDataFrame("1")` as invoking this toDataFrame and then apply on the returned DataFrame. + def toDataFrame(): DataFrame = this /** * Returns a new [[DataFrame]] with columns renamed. This can be quite convenient in conversion @@ -234,16 +238,6 @@ trait DataFrame extends RDDApi[Row] { */ def col(colName: String): Column - /** - * Selects a set of expressions, wrapped in a Product. - * {{{ - * // The following two are equivalent: - * df.apply(($"colA", $"colB" + 1)) - * df.select($"colA", $"colB" + 1) - * }}} - */ - def apply(projection: Product): DataFrame - /** * Returns a new [[DataFrame]] with an alias set. */ @@ -317,17 +311,6 @@ trait DataFrame extends RDDApi[Row] { */ def where(condition: Column): DataFrame - /** - * Filters rows using the given condition. This is a shorthand meant for Scala. - * {{{ - * // The following are equivalent: - * peopleDf.filter($"age" > 15) - * peopleDf.where($"age" > 15) - * peopleDf($"age" > 15) - * }}} - */ - def apply(condition: Column): DataFrame - /** * Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them. * See [[GroupedData]] for all the available aggregate functions. 
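With the two `apply` overloads for projection and filtering gone, a brief self-contained sketch of the replacement calls may help. It is illustrative only: the column names, the app name, and the `local` master are assumptions, and it leans on the tuple-RDD implicits added in this same commit.

```
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object DataFrameApiSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("df-api-sketch").setMaster("local"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // The implicits added in this commit turn an RDD of tuples into a DataFrame.
    val df = sc.parallelize(1 to 10).map(i => (i, i.toString)).toDataFrame("age", "name")

    // Projection: instead of the removed df((...)) overload, call select.
    df.select(df("name"), df("age") + 1).show()

    // Filtering: instead of the removed df(condition) overload, call filter or where.
    df.filter(df("age") > 5).show()
    df.where(df("age") > 5).show()

    sc.stop()
  }
}
```

As far as this diff shows, the single-argument string lookup (df("name")) is untouched; only the projection and filtering overloads go away.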
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 41da4424ae459..3863df53181a3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -49,8 +49,10 @@ private[sql] class DataFrameImpl protected[sql]( extends DataFrame { /** - * A constructor that automatically analyzes the logical plan. This reports error eagerly - * as the [[DataFrame]] is constructed. + * A constructor that automatically analyzes the logical plan. + * + * This reports error eagerly as the [[DataFrame]] is constructed, unless + * [[SQLConf.dataFrameEagerAnalysis]] is turned off. */ def this(sqlContext: SQLContext, logicalPlan: LogicalPlan) = { this(sqlContext, { @@ -158,7 +160,7 @@ private[sql] class DataFrameImpl protected[sql]( } override def show(): Unit = { - println(showString) + println(showString()) } override def join(right: DataFrame): DataFrame = { @@ -205,14 +207,6 @@ private[sql] class DataFrameImpl protected[sql]( Column(sqlContext, Project(Seq(expr), logicalPlan), expr) } - override def apply(projection: Product): DataFrame = { - require(projection.productArity >= 1) - select(projection.productIterator.map { - case c: Column => c - case o: Any => Column(Literal(o)) - }.toSeq :_*) - } - override def as(alias: String): DataFrame = Subquery(alias, logicalPlan) override def as(alias: Symbol): DataFrame = Subquery(alias.name, logicalPlan) @@ -259,10 +253,6 @@ private[sql] class DataFrameImpl protected[sql]( filter(condition) } - override def apply(condition: Column): DataFrame = { - filter(condition) - } - override def groupBy(cols: Column*): GroupedData = { new GroupedData(this, cols.map(_.expr)) } @@ -323,7 +313,7 @@ private[sql] class DataFrameImpl protected[sql]( override def count(): Long = groupBy().count().rdd.collect().head.getLong(0) override def repartition(numPartitions: Int): DataFrame = { - sqlContext.applySchema(rdd.repartition(numPartitions), schema) + sqlContext.createDataFrame(rdd.repartition(numPartitions), schema) } override def distinct: DataFrame = Distinct(logicalPlan) @@ -401,7 +391,7 @@ private[sql] class DataFrameImpl protected[sql]( val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) new Iterator[String] { - override def hasNext() = iter.hasNext + override def hasNext = iter.hasNext override def next(): String = { JsonRDD.rowToJSON(rowSchema, gen)(iter.next()) gen.flush() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala index 494e49c1317b6..4f9d92d97646f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala @@ -80,8 +80,6 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten override def col(colName: String): Column = err() - override def apply(projection: Product): DataFrame = err() - override def select(cols: Column*): DataFrame = err() override def select(col: String, cols: String*): DataFrame = err() @@ -98,8 +96,6 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten override def where(condition: Column): DataFrame = err() - override def apply(condition: Column): DataFrame = err() - override def groupBy(cols: Column*): GroupedData = err() override def groupBy(col1: String, cols: String*): 
GroupedData = err() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index fd121ce05698c..ca5e62f29569e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -180,21 +180,59 @@ class SQLContext(@transient val sparkContext: SparkContext) */ object implicits { // scalastyle:on - /** - * Creates a DataFrame from an RDD of case classes. - * - * @group userf - */ + + /** Creates a DataFrame from an RDD of case classes or tuples. */ implicit def rddToDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = { self.createDataFrame(rdd) } - /** - * Creates a DataFrame from a local Seq of Product. - */ + /** Creates a DataFrame from a local Seq of Product. */ implicit def localSeqToDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = { self.createDataFrame(data) } + + // Do NOT add more implicit conversions. They are likely to break source compatibility by + // making existing implicit conversions ambiguous. In particular, RDD[Double] is dangerous + // because of [[DoubleRDDFunctions]]. + + /** Creates a single column DataFrame from an RDD[Int]. */ + implicit def intRddToDataFrame(data: RDD[Int]): DataFrame = { + val dataType = IntegerType + val rows = data.mapPartitions { iter => + val row = new SpecificMutableRow(dataType :: Nil) + iter.map { v => + row.setInt(0, v) + row: Row + } + } + self.createDataFrame(rows, StructType(StructField("_1", dataType) :: Nil)) + } + + /** Creates a single column DataFrame from an RDD[Long]. */ + implicit def longRddToDataFrame(data: RDD[Long]): DataFrame = { + val dataType = LongType + val rows = data.mapPartitions { iter => + val row = new SpecificMutableRow(dataType :: Nil) + iter.map { v => + row.setLong(0, v) + row: Row + } + } + self.createDataFrame(rows, StructType(StructField("_1", dataType) :: Nil)) + } + + /** Creates a single column DataFrame from an RDD[String]. */ + implicit def stringRddToDataFrame(data: RDD[String]): DataFrame = { + val dataType = StringType + val rows = data.mapPartitions { iter => + val row = new SpecificMutableRow(dataType :: Nil) + iter.map { v => + row.setString(0, v) + row: Row + } + } + self.createDataFrame(rows, StructType(StructField("_1", dataType) :: Nil)) + } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala new file mode 100644 index 0000000000000..8fa830dd9390f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala @@ -0,0 +1,55 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql + +import org.apache.spark.sql.test.TestSQLContext.{sparkContext => sc} +import org.apache.spark.sql.test.TestSQLContext.implicits._ + + +class DataFrameImplicitsSuite extends QueryTest { + + test("RDD of tuples") { + checkAnswer( + sc.parallelize(1 to 10).map(i => (i, i.toString)).toDataFrame("intCol", "strCol"), + (1 to 10).map(i => Row(i, i.toString))) + } + + test("Seq of tuples") { + checkAnswer( + (1 to 10).map(i => (i, i.toString)).toDataFrame("intCol", "strCol"), + (1 to 10).map(i => Row(i, i.toString))) + } + + test("RDD[Int]") { + checkAnswer( + sc.parallelize(1 to 10).toDataFrame("intCol"), + (1 to 10).map(i => Row(i))) + } + + test("RDD[Long]") { + checkAnswer( + sc.parallelize(1L to 10L).toDataFrame("longCol"), + (1L to 10L).map(i => Row(i))) + } + + test("RDD[String]") { + checkAnswer( + sc.parallelize(1 to 10).map(_.toString).toDataFrame("stringCol"), + (1 to 10).map(i => Row(i.toString))) + } +} From a38e23c30fb5d12f8f46a119d91a0620036e6800 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Wed, 11 Feb 2015 19:05:49 -0800 Subject: [PATCH 129/357] [SQL] Make dataframe more tolerant of being serialized Eases use in the spark-shell. Author: Michael Armbrust Closes #4545 from marmbrus/serialization and squashes the following commits: 04748e6 [Michael Armbrust] @scala.annotation.varargs b36e219 [Michael Armbrust] moreFixes --- .../src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +- .../main/scala/org/apache/spark/sql/DataFrameImpl.scala | 4 ++-- sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala | 9 +++++++++ .../src/main/scala/org/apache/spark/sql/SQLContext.scala | 4 +++- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 327cf87f30f4b..13aff760e9a5c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -76,7 +76,7 @@ private[sql] object DataFrame { */ // TODO: Improve documentation. @Experimental -trait DataFrame extends RDDApi[Row] { +trait DataFrame extends RDDApi[Row] with Serializable { val sqlContext: SQLContext diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 3863df53181a3..4c6e19cace8ca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -44,8 +44,8 @@ import org.apache.spark.sql.types.{NumericType, StructType} * Internal implementation of [[DataFrame]]. Users of the API should use [[DataFrame]] directly. 
*/ private[sql] class DataFrameImpl protected[sql]( - override val sqlContext: SQLContext, - val queryExecution: SQLContext#QueryExecution) + @transient override val sqlContext: SQLContext, + @transient val queryExecution: SQLContext#QueryExecution) extends DataFrame { /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala index 6bf21dd1bc79b..7bc7683576b71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala @@ -166,6 +166,15 @@ object Dsl { ////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////// + /** + * Returns the first column that is not null. + * {{{ + * df.select(coalesce(df("a"), df("b"))) + * }}} + */ + @scala.annotation.varargs + def coalesce(e: Column*): Column = Coalesce(e.map(_.expr)) + /** * Unary minus, i.e. negate the expression. * {{{ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index ca5e62f29569e..8aae222acd927 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -129,9 +129,11 @@ class SQLContext(@transient val sparkContext: SparkContext) * A collection of methods that are considered experimental, but can be used to hook into * the query planner for advanced functionalities. */ + @transient val experimental: ExperimentalMethods = new ExperimentalMethods(this) /** Returns a [[DataFrame]] with no rows or columns. */ + @transient lazy val emptyDataFrame = DataFrame(this, NoRelation) /** @@ -178,7 +180,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * (Scala-specific) * Implicit methods available in Scala for converting common Scala objects into [[DataFrame]]s. */ - object implicits { + object implicits extends Serializable { // scalastyle:on /** Creates a DataFrame from an RDD of case classes or tuples. */ From 9a3ea49f74b9a41dba527a7e995735f488fe0847 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Thu, 12 Feb 2015 12:36:26 +0000 Subject: [PATCH 130/357] SPARK-5727 [BUILD] Remove Debian packaging (for master / 1.4 only) Author: Sean Owen Closes #4526 from srowen/SPARK-5727.2 and squashes the following commits: 83ba49c [Sean Owen] Remove Debian packaging --- assembly/pom.xml | 129 ------------------------------- assembly/src/deb/RELEASE | 2 - assembly/src/deb/control/control | 8 -- docs/building-spark.md | 10 --- repl/pom.xml | 2 - 5 files changed, 151 deletions(-) delete mode 100644 assembly/src/deb/RELEASE delete mode 100644 assembly/src/deb/control/control diff --git a/assembly/pom.xml b/assembly/pom.xml index c1bcdbb664dd0..fa9f56e556d8b 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -36,10 +36,6 @@ scala-${scala.binary.version} spark-assembly-${project.version}-hadoop${hadoop.version}.jar ${project.build.directory}/${spark.jar.dir}/${spark.jar.basename} - spark - /usr/share/spark - root - 755 @@ -217,131 +213,6 @@ - - deb - - - - maven-antrun-plugin - - - prepare-package - - run - - - - - NOTE: Debian packaging is deprecated and is scheduled to be removed in Spark 1.4. 
- - - - - - - - org.codehaus.mojo - buildnumber-maven-plugin - 1.2 - - - validate - - create - - - 8 - - - - - - org.vafer - jdeb - 0.11 - - - package - - jdeb - - - ${project.build.directory}/${deb.pkg.name}_${project.version}-${buildNumber}_all.deb - false - gzip - - - ${spark.jar} - file - - perm - ${deb.user} - ${deb.user} - ${deb.install.path}/jars - - - - ${basedir}/src/deb/RELEASE - file - - perm - ${deb.user} - ${deb.user} - ${deb.install.path} - - - - ${basedir}/../conf - directory - - perm - ${deb.user} - ${deb.user} - ${deb.install.path}/conf - ${deb.bin.filemode} - - - - ${basedir}/../bin - directory - - perm - ${deb.user} - ${deb.user} - ${deb.install.path}/bin - ${deb.bin.filemode} - - - - ${basedir}/../sbin - directory - - perm - ${deb.user} - ${deb.user} - ${deb.install.path}/sbin - ${deb.bin.filemode} - - - - ${basedir}/../python - directory - - perm - ${deb.user} - ${deb.user} - ${deb.install.path}/python - ${deb.bin.filemode} - - - - - - - - - - kinesis-asl diff --git a/assembly/src/deb/RELEASE b/assembly/src/deb/RELEASE deleted file mode 100644 index aad50ee73aa45..0000000000000 --- a/assembly/src/deb/RELEASE +++ /dev/null @@ -1,2 +0,0 @@ -compute-classpath.sh uses the existence of this file to decide whether to put the assembly jar on the -classpath or instead to use classfiles in the source tree. \ No newline at end of file diff --git a/assembly/src/deb/control/control b/assembly/src/deb/control/control deleted file mode 100644 index a6b4471d485f4..0000000000000 --- a/assembly/src/deb/control/control +++ /dev/null @@ -1,8 +0,0 @@ -Package: [[deb.pkg.name]] -Version: [[version]]-[[buildNumber]] -Section: misc -Priority: extra -Architecture: all -Maintainer: Matei Zaharia -Description: [[name]] -Distribution: development diff --git a/docs/building-spark.md b/docs/building-spark.md index d3824fb61eef9..088da7da4980e 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -159,16 +159,6 @@ Thus, the full flow for running continuous-compilation of the `core` submodule m For help in setting up IntelliJ IDEA or Eclipse for Spark development, and troubleshooting, refer to the [wiki page for IDE setup](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-IDESetup). -# Building Spark Debian Packages - -_NOTE: Debian packaging is deprecated and is scheduled to be removed in Spark 1.4._ - -The Maven build includes support for building a Debian package containing the assembly 'fat-jar', PySpark, and the necessary scripts and configuration files. This can be created by specifying the following: - - mvn -Pdeb -DskipTests clean package - -The debian package can then be found under assembly/target. We added the short commit hash to the file name so that we can distinguish individual packages built for SNAPSHOT versions. - # Running Java 8 Test Suites Running only Java 8 tests and nothing else. diff --git a/repl/pom.xml b/repl/pom.xml index 3d4adf8fd5b03..b883344bf0ceb 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -33,8 +33,6 @@ repl - /usr/share/spark - root scala-2.10/src/main/scala scala-2.10/src/test/scala From 9a6efbccf9eb3ab13295dfd204df8e58e5112209 Mon Sep 17 00:00:00 2001 From: Oren Mazor Date: Thu, 12 Feb 2015 18:37:00 +0000 Subject: [PATCH 131/357] ignore cache paths for RAT tests RAT fails on cache paths. 
add to .rat-excludes Author: Oren Mazor Closes #4569 from orenmazor/apache_master and squashes the following commits: d0c9e7e [Oren Mazor] ignore cache paths for RAT tests --- .rat-excludes | 1 + 1 file changed, 1 insertion(+) diff --git a/.rat-excludes b/.rat-excludes index 769defbac11b7..a788e8273d8a2 100644 --- a/.rat-excludes +++ b/.rat-excludes @@ -1,4 +1,5 @@ target +cache .gitignore .gitattributes .project From 466b1f671b21f575d28f9c103f51765790914fe3 Mon Sep 17 00:00:00 2001 From: Andrew Rowson Date: Thu, 12 Feb 2015 18:41:39 +0000 Subject: [PATCH 132/357] [SPARK-5655] Don't chmod700 application files if running in YARN [Was previously PR4507] As per SPARK-5655, recently committed code chmod 700s all application files created on the local fs by a spark executor. This is both unnecessary and broken on YARN, where files created in the nodemanager's working directory are already owned by the user running the job and the 'yarn' group. Group read permission is also needed for the auxiliary shuffle service to be able to read the files, as this is running as the 'yarn' user. Author: Andrew Rowson Closes #4509 from growse/master and squashes the following commits: 7ca993c [Andrew Rowson] Moved chmod700 functionality into Utils.getOrCreateLocalRootDirs f57ce6b [Andrew Rowson] [SPARK-5655] Don't chmod700 application files if running in a YARN container --- core/src/main/scala/org/apache/spark/util/Utils.scala | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 6af8dd555f2aa..c06bd6fab0cc9 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -283,13 +283,6 @@ private[spark] object Utils extends Logging { dir = new File(root, namePrefix + "-" + UUID.randomUUID.toString) if (dir.exists() || !dir.mkdirs()) { dir = null - } else { - // Restrict file permissions via chmod if available. - // For Windows this step is ignored. - if (!isWindows && !chmod700(dir)) { - dir.delete() - dir = null - } } } catch { case e: SecurityException => dir = null; } } @@ -703,7 +696,9 @@ private[spark] object Utils extends Logging { try { val rootDir = new File(root) if (rootDir.exists || rootDir.mkdirs()) { - Some(createDirectory(root).getAbsolutePath()) + val dir = createDirectory(root) + chmod700(dir) + Some(dir.getAbsolutePath) } else { logError(s"Failed to create dir in $root. Ignoring this directory.") None From 99bd5006650bb15ec5465ffee1ebaca81354a3df Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 12 Feb 2015 10:48:13 -0800 Subject: [PATCH 133/357] [SPARK-5757][MLLIB] replace SQL JSON usage in model import/export by json4s This PR detaches MLlib model import/export code from SQL's JSON support, and hence unblocks #4544 . 
yhuai Author: Xiangrui Meng Closes #4555 from mengxr/SPARK-5757 and squashes the following commits: b0415e8 [Xiangrui Meng] replace SQL JSON usage by json4s --- .../classification/ClassificationModel.scala | 16 ++---- .../classification/LogisticRegression.scala | 3 +- .../mllib/classification/NaiveBayes.scala | 18 +++---- .../spark/mllib/classification/SVM.scala | 6 +-- .../impl/GLMClassificationModel.scala | 17 ++++--- .../MatrixFactorizationModel.scala | 14 +++-- .../apache/spark/mllib/regression/Lasso.scala | 2 +- .../mllib/regression/LinearRegression.scala | 2 +- .../mllib/regression/RegressionModel.scala | 16 ++---- .../mllib/regression/RidgeRegression.scala | 2 +- .../regression/impl/GLMRegressionModel.scala | 11 ++-- .../spark/mllib/tree/DecisionTree.scala | 8 +-- .../mllib/tree/model/DecisionTreeModel.scala | 28 ++++------ .../mllib/tree/model/treeEnsembleModels.scala | 51 ++++++++----------- .../spark/mllib/util/modelSaveLoad.scala | 25 +++------ 15 files changed, 92 insertions(+), 127 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala index 348c1e8760a66..35a0db76f3a8c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala @@ -17,12 +17,12 @@ package org.apache.spark.mllib.classification +import org.json4s.{DefaultFormats, JValue} + import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.mllib.util.Loader import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row} /** * :: Experimental :: @@ -60,16 +60,10 @@ private[mllib] object ClassificationModel { /** * Helper method for loading GLM classification model metadata. 
- * - * @param modelClass String name for model class (used for error messages) * @return (numFeatures, numClasses) */ - def getNumFeaturesClasses(metadata: DataFrame, modelClass: String, path: String): (Int, Int) = { - metadata.select("numFeatures", "numClasses").take(1)(0) match { - case Row(nFeatures: Int, nClasses: Int) => (nFeatures, nClasses) - case _ => throw new Exception(s"$modelClass unable to load" + - s" numFeatures, numClasses from metadata: ${Loader.metadataPath(path)}") - } + def getNumFeaturesClasses(metadata: JValue): (Int, Int) = { + implicit val formats = DefaultFormats + ((metadata \ "numFeatures").extract[Int], (metadata \ "numClasses").extract[Int]) } - } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index 9a391bfff76a3..420d6e2861934 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -173,8 +173,7 @@ object LogisticRegressionModel extends Loader[LogisticRegressionModel] { val classNameV1_0 = "org.apache.spark.mllib.classification.LogisticRegressionModel" (loadedClassName, version) match { case (className, "1.0") if className == classNameV1_0 => - val (numFeatures, numClasses) = - ClassificationModel.getNumFeaturesClasses(metadata, classNameV1_0, path) + val (numFeatures, numClasses) = ClassificationModel.getNumFeaturesClasses(metadata) val data = GLMClassificationModel.SaveLoadV1_0.loadData(sc, path, classNameV1_0) // numFeatures, numClasses, weights are checked in model initialization val model = diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index d9ce2822dd391..f9142bc2268b7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -18,15 +18,16 @@ package org.apache.spark.mllib.classification import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum} +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ -import org.apache.spark.{SparkContext, SparkException, Logging} +import org.apache.spark.{Logging, SparkContext, SparkException} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLContext} - /** * Model for Naive Bayes Classifiers. * @@ -78,7 +79,7 @@ class NaiveBayesModel private[mllib] ( object NaiveBayesModel extends Loader[NaiveBayesModel] { - import Loader._ + import org.apache.spark.mllib.util.Loader._ private object SaveLoadV1_0 { @@ -95,10 +96,10 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { import sqlContext.implicits._ // Create JSON metadata. 
- val metadataRDD = - sc.parallelize(Seq((thisClassName, thisFormatVersion, data.theta(0).size, data.pi.size)), 1) - .toDataFrame("class", "version", "numFeatures", "numClasses") - metadataRDD.toJSON.saveAsTextFile(metadataPath(path)) + val metadata = compact(render( + ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ + ("numFeatures" -> data.theta(0).length) ~ ("numClasses" -> data.pi.length))) + sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path)) // Create Parquet data. val dataRDD: DataFrame = sc.parallelize(Seq(data), 1) @@ -126,8 +127,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { val classNameV1_0 = SaveLoadV1_0.thisClassName (loadedClassName, version) match { case (className, "1.0") if className == classNameV1_0 => - val (numFeatures, numClasses) = - ClassificationModel.getNumFeaturesClasses(metadata, classNameV1_0, path) + val (numFeatures, numClasses) = ClassificationModel.getNumFeaturesClasses(metadata) val model = SaveLoadV1_0.load(sc, path) assert(model.pi.size == numClasses, s"NaiveBayesModel.load expected $numClasses classes," + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 24d31e62ba500..cfc7f868a02f0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -23,10 +23,9 @@ import org.apache.spark.mllib.classification.impl.GLMClassificationModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.regression._ -import org.apache.spark.mllib.util.{DataValidators, Saveable, Loader} +import org.apache.spark.mllib.util.{DataValidators, Loader, Saveable} import org.apache.spark.rdd.RDD - /** * Model for Support Vector Machines (SVMs). * @@ -97,8 +96,7 @@ object SVMModel extends Loader[SVMModel] { val classNameV1_0 = "org.apache.spark.mllib.classification.SVMModel" (loadedClassName, version) match { case (className, "1.0") if className == classNameV1_0 => - val (numFeatures, numClasses) = - ClassificationModel.getNumFeaturesClasses(metadata, classNameV1_0, path) + val (numFeatures, numClasses) = ClassificationModel.getNumFeaturesClasses(metadata) val data = GLMClassificationModel.SaveLoadV1_0.loadData(sc, path, classNameV1_0) val model = new SVMModel(data.weights, data.intercept) assert(model.weights.size == numFeatures, s"SVMModel.load with numFeatures=$numFeatures" + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala index 8d600572ed7f3..1d118963b49fc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala @@ -17,10 +17,13 @@ package org.apache.spark.mllib.classification.impl +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader -import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.sql.{Row, SQLContext} /** * Helper class for import/export of GLM classification models. @@ -52,16 +55,14 @@ private[classification] object GLMClassificationModel { import sqlContext.implicits._ // Create JSON metadata. 
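The persistence changes in this commit keep repeating one pattern: build the metadata with json4s's DSL, save it as a single JSON string, and pull typed fields back out with `extract` on load. A standalone sketch of that round trip; the class name and the feature/class counts below are placeholders, not values taken from any real model.

```
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

object MetadataRoundTripSketch {
  def main(args: Array[String]): Unit = {
    implicit val formats = DefaultFormats  // needed by extract below

    // Write side: the same DSL the save() helpers use.
    val metadata: String = compact(render(
      ("class" -> "org.example.SomeModel") ~
        ("version" -> "1.0") ~
        ("numFeatures" -> 692) ~
        ("numClasses" -> 2)))

    // Read side: parse the string and extract typed fields, as the load() helpers do.
    val parsed = parse(metadata)
    val numFeatures = (parsed \ "numFeatures").extract[Int]
    val numClasses = (parsed \ "numClasses").extract[Int]
    println(s"numFeatures=$numFeatures, numClasses=$numClasses")
  }
}
```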
- val metadataRDD = - sc.parallelize(Seq((modelClass, thisFormatVersion, numFeatures, numClasses)), 1) - .toDataFrame("class", "version", "numFeatures", "numClasses") - metadataRDD.toJSON.saveAsTextFile(Loader.metadataPath(path)) + val metadata = compact(render( + ("class" -> modelClass) ~ ("version" -> thisFormatVersion) ~ + ("numFeatures" -> numFeatures) ~ ("numClasses" -> numClasses))) + sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) // Create Parquet data. val data = Data(weights, intercept, threshold) - val dataRDD: DataFrame = sc.parallelize(Seq(data), 1) - // TODO: repartition with 1 partition after SPARK-5532 gets fixed - dataRDD.saveAsParquetFile(Loader.dataPath(path)) + sc.parallelize(Seq(data), 1).saveAsParquetFile(Loader.dataPath(path)) } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala index 16979c9ed43ca..a3a3b5d418c0a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala @@ -22,6 +22,9 @@ import java.lang.{Integer => JavaInteger} import org.apache.hadoop.fs.Path import org.jblas.DoubleMatrix +import org.json4s._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ import org.apache.spark.{Logging, SparkContext} import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} @@ -153,7 +156,7 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] { import org.apache.spark.mllib.util.Loader._ override def load(sc: SparkContext, path: String): MatrixFactorizationModel = { - val (loadedClassName, formatVersion, metadata) = loadMetadata(sc, path) + val (loadedClassName, formatVersion, _) = loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName (loadedClassName, formatVersion) match { case (className, "1.0") if className == classNameV1_0 => @@ -181,19 +184,20 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] { val sc = model.userFeatures.sparkContext val sqlContext = new SQLContext(sc) import sqlContext.implicits._ - val metadata = (thisClassName, thisFormatVersion, model.rank) - val metadataRDD = sc.parallelize(Seq(metadata), 1).toDataFrame("class", "version", "rank") - metadataRDD.toJSON.saveAsTextFile(metadataPath(path)) + val metadata = compact(render( + ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ ("rank" -> model.rank))) + sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path)) model.userFeatures.toDataFrame("id", "features").saveAsParquetFile(userPath(path)) model.productFeatures.toDataFrame("id", "features").saveAsParquetFile(productPath(path)) } def load(sc: SparkContext, path: String): MatrixFactorizationModel = { + implicit val formats = DefaultFormats val sqlContext = new SQLContext(sc) val (className, formatVersion, metadata) = loadMetadata(sc, path) assert(className == thisClassName) assert(formatVersion == thisFormatVersion) - val rank = metadata.select("rank").first().getInt(0) + val rank = (metadata \ "rank").extract[Int] val userFeatures = sqlContext.parquetFile(userPath(path)) .map { case Row(id: Int, features: Seq[Double]) => (id, features.toArray) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala index 1159e59fff5f6..e8b03816573cf 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/Lasso.scala @@ -58,7 +58,7 @@ object LassoModel extends Loader[LassoModel] { val classNameV1_0 = "org.apache.spark.mllib.regression.LassoModel" (loadedClassName, version) match { case (className, "1.0") if className == classNameV1_0 => - val numFeatures = RegressionModel.getNumFeatures(metadata, classNameV1_0, path) + val numFeatures = RegressionModel.getNumFeatures(metadata) val data = GLMRegressionModel.SaveLoadV1_0.loadData(sc, path, classNameV1_0, numFeatures) new LassoModel(data.weights, data.intercept) case _ => throw new Exception( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala index 0136dcfdceaef..6fa7ad52a5b33 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala @@ -58,7 +58,7 @@ object LinearRegressionModel extends Loader[LinearRegressionModel] { val classNameV1_0 = "org.apache.spark.mllib.regression.LinearRegressionModel" (loadedClassName, version) match { case (className, "1.0") if className == classNameV1_0 => - val numFeatures = RegressionModel.getNumFeatures(metadata, classNameV1_0, path) + val numFeatures = RegressionModel.getNumFeatures(metadata) val data = GLMRegressionModel.SaveLoadV1_0.loadData(sc, path, classNameV1_0, numFeatures) new LinearRegressionModel(data.weights, data.intercept) case _ => throw new Exception( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala index 843e59bdfbdd2..214ac4d0ed7dd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RegressionModel.scala @@ -17,12 +17,12 @@ package org.apache.spark.mllib.regression +import org.json4s.{DefaultFormats, JValue} + import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.mllib.util.Loader import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row} @Experimental trait RegressionModel extends Serializable { @@ -55,16 +55,10 @@ private[mllib] object RegressionModel { /** * Helper method for loading GLM regression model metadata. 
- * - * @param modelClass String name for model class (used for error messages) * @return numFeatures */ - def getNumFeatures(metadata: DataFrame, modelClass: String, path: String): Int = { - metadata.select("numFeatures").take(1)(0) match { - case Row(nFeatures: Int) => nFeatures - case _ => throw new Exception(s"$modelClass unable to load" + - s" numFeatures from metadata: ${Loader.metadataPath(path)}") - } + def getNumFeatures(metadata: JValue): Int = { + implicit val formats = DefaultFormats + (metadata \ "numFeatures").extract[Int] } - } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala index f2a5f1db1ece6..8838ca8c14718 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala @@ -59,7 +59,7 @@ object RidgeRegressionModel extends Loader[RidgeRegressionModel] { val classNameV1_0 = "org.apache.spark.mllib.regression.RidgeRegressionModel" (loadedClassName, version) match { case (className, "1.0") if className == classNameV1_0 => - val numFeatures = RegressionModel.getNumFeatures(metadata, classNameV1_0, path) + val numFeatures = RegressionModel.getNumFeatures(metadata) val data = GLMRegressionModel.SaveLoadV1_0.loadData(sc, path, classNameV1_0, numFeatures) new RidgeRegressionModel(data.weights, data.intercept) case _ => throw new Exception( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala index 838100e949ec2..f75de6f637640 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala @@ -17,6 +17,9 @@ package org.apache.spark.mllib.regression.impl +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.Loader @@ -48,10 +51,10 @@ private[regression] object GLMRegressionModel { import sqlContext.implicits._ // Create JSON metadata. - val metadataRDD = - sc.parallelize(Seq((modelClass, thisFormatVersion, weights.size)), 1) - .toDataFrame("class", "version", "numFeatures") - metadataRDD.toJSON.saveAsTextFile(Loader.metadataPath(path)) + val metadata = compact(render( + ("class" -> modelClass) ~ ("version" -> thisFormatVersion) ~ + ("numFeatures" -> weights.size))) + sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) // Create Parquet data. 
val data = Data(weights, intercept) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index b3e8ed9af8c51..9a586b9d9c68e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -17,14 +17,13 @@ package org.apache.spark.mllib.tree -import scala.collection.JavaConverters._ import scala.collection.mutable +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer - +import org.apache.spark.Logging import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD -import org.apache.spark.Logging import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.RandomForest.NodeIndexInfo import org.apache.spark.mllib.tree.configuration.Strategy @@ -32,13 +31,10 @@ import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.configuration.FeatureType._ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._ import org.apache.spark.mllib.tree.impl._ -import org.apache.spark.mllib.tree.impurity.{Impurities, Impurity} import org.apache.spark.mllib.tree.impurity._ import org.apache.spark.mllib.tree.model._ import org.apache.spark.rdd.RDD import org.apache.spark.util.random.XORShiftRandom -import org.apache.spark.SparkContext._ - /** * :: Experimental :: diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala index 89ecf3773dd77..373192a20c0ea 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala @@ -19,6 +19,10 @@ package org.apache.spark.mllib.tree.model import scala.collection.mutable +import org.json4s._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD @@ -184,10 +188,10 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] { import sqlContext.implicits._ // Create JSON metadata. - val metadataRDD = sc.parallelize( - Seq((thisClassName, thisFormatVersion, model.algo.toString, model.numNodes)), 1) - .toDataFrame("class", "version", "algo", "numNodes") - metadataRDD.toJSON.saveAsTextFile(Loader.metadataPath(path)) + val metadata = compact(render( + ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ + ("algo" -> model.algo.toString) ~ ("numNodes" -> model.numNodes))) + sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) // Create Parquet data. val nodes = model.topNode.subtreeIterator.toSeq @@ -269,20 +273,10 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] { } override def load(sc: SparkContext, path: String): DecisionTreeModel = { + implicit val formats = DefaultFormats val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) - val (algo: String, numNodes: Int) = try { - val algo_numNodes = metadata.select("algo", "numNodes").collect() - assert(algo_numNodes.length == 1) - algo_numNodes(0) match { - case Row(a: String, n: Int) => (a, n) - } - } catch { - // Catch both Error and Exception since the checks above can throw either. 
- case e: Throwable => - throw new Exception( - s"Unable to load DecisionTreeModel metadata from: ${Loader.metadataPath(path)}." - + s" Error message: ${e.getMessage}") - } + val algo = (metadata \ "algo").extract[String] + val numNodes = (metadata \ "numNodes").extract[Int] val classNameV1_0 = SaveLoadV1_0.thisClassName (loadedClassName, version) match { case (className, "1.0") if className == classNameV1_0 => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index 23bd46baabf65..dbd69dca60b92 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -20,18 +20,20 @@ package org.apache.spark.mllib.tree.model import scala.collection.mutable import com.github.fommil.netlib.BLAS.{getInstance => blas} +import org.json4s._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.configuration.Algo +import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.configuration.EnsembleCombiningStrategy._ -import org.apache.spark.mllib.util.{Saveable, Loader} +import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SQLContext} - +import org.apache.spark.sql.SQLContext /** * :: Experimental :: @@ -59,11 +61,11 @@ class RandomForestModel(override val algo: Algo, override val trees: Array[Decis object RandomForestModel extends Loader[RandomForestModel] { override def load(sc: SparkContext, path: String): RandomForestModel = { - val (loadedClassName, version, metadataRDD) = Loader.loadMetadata(sc, path) + val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName (loadedClassName, version) match { case (className, "1.0") if className == classNameV1_0 => - val metadata = TreeEnsembleModel.SaveLoadV1_0.readMetadata(metadataRDD, path) + val metadata = TreeEnsembleModel.SaveLoadV1_0.readMetadata(jsonMetadata) assert(metadata.treeWeights.forall(_ == 1.0)) val trees = TreeEnsembleModel.SaveLoadV1_0.loadTrees(sc, path, metadata.treeAlgo) @@ -110,11 +112,11 @@ class GradientBoostedTreesModel( object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { override def load(sc: SparkContext, path: String): GradientBoostedTreesModel = { - val (loadedClassName, version, metadataRDD) = Loader.loadMetadata(sc, path) + val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName (loadedClassName, version) match { case (className, "1.0") if className == classNameV1_0 => - val metadata = TreeEnsembleModel.SaveLoadV1_0.readMetadata(metadataRDD, path) + val metadata = TreeEnsembleModel.SaveLoadV1_0.readMetadata(jsonMetadata) assert(metadata.combiningStrategy == Sum.toString) val trees = TreeEnsembleModel.SaveLoadV1_0.loadTrees(sc, path, metadata.treeAlgo) @@ -252,7 +254,7 @@ private[tree] object TreeEnsembleModel { object SaveLoadV1_0 { - import DecisionTreeModel.SaveLoadV1_0.{NodeData, constructTrees} + import 
org.apache.spark.mllib.tree.model.DecisionTreeModel.SaveLoadV1_0.{NodeData, constructTrees} def thisFormatVersion = "1.0" @@ -276,11 +278,13 @@ private[tree] object TreeEnsembleModel { import sqlContext.implicits._ // Create JSON metadata. - val metadata = Metadata(model.algo.toString, model.trees(0).algo.toString, + implicit val format = DefaultFormats + val ensembleMetadata = Metadata(model.algo.toString, model.trees(0).algo.toString, model.combiningStrategy.toString, model.treeWeights) - val metadataRDD = sc.parallelize(Seq((className, thisFormatVersion, metadata)), 1) - .toDataFrame("class", "version", "metadata") - metadataRDD.toJSON.saveAsTextFile(Loader.metadataPath(path)) + val metadata = compact(render( + ("class" -> className) ~ ("version" -> thisFormatVersion) ~ + ("metadata" -> Extraction.decompose(ensembleMetadata)))) + sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) // Create Parquet data. val dataRDD = sc.parallelize(model.trees.zipWithIndex).flatMap { case (tree, treeId) => @@ -290,24 +294,11 @@ private[tree] object TreeEnsembleModel { } /** - * Read metadata from the loaded metadata DataFrame. - * @param path Path for loading data, used for debug messages. + * Read metadata from the loaded JSON metadata. */ - def readMetadata(metadata: DataFrame, path: String): Metadata = { - try { - // We rely on the try-catch for schema checking rather than creating a schema just for this. - val metadataArray = metadata.select("metadata.algo", "metadata.treeAlgo", - "metadata.combiningStrategy", "metadata.treeWeights").collect() - assert(metadataArray.size == 1) - Metadata(metadataArray(0).getString(0), metadataArray(0).getString(1), - metadataArray(0).getString(2), metadataArray(0).getAs[Seq[Double]](3).toArray) - } catch { - // Catch both Error and Exception since the checks above can throw either. - case e: Throwable => - throw new Exception( - s"Unable to load TreeEnsembleModel metadata from: ${Loader.metadataPath(path)}." - + s" Error message: ${e.getMessage}") - } + def readMetadata(metadata: JValue): Metadata = { + implicit val formats = DefaultFormats + (metadata \ "metadata").extract[Metadata] } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala index 56b77a7d12e83..4458340497f0b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala @@ -20,13 +20,13 @@ package org.apache.spark.mllib.util import scala.reflect.runtime.universe.TypeTag import org.apache.hadoop.fs.Path +import org.json4s._ +import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.sql.catalyst.ScalaReflection -import org.apache.spark.sql.types.{DataType, StructType, StructField} - +import org.apache.spark.sql.types.{DataType, StructField, StructType} /** * :: DeveloperApi :: @@ -120,20 +120,11 @@ private[mllib] object Loader { * Load metadata from the given path. 
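The TreeEnsembleModel.SaveLoadV1_0 changes above go a step further and round-trip a whole case class through json4s, with Extraction.decompose on save and extract[Metadata] on load. A hedged, standalone sketch of that pattern is below; the case class, object name, and field values are illustrative stand-ins rather than the patch's own types.

    import org.json4s._
    import org.json4s.JsonDSL._
    import org.json4s.jackson.JsonMethods._

    // Top-level case class so json4s reflection-based extraction can handle it.
    case class EnsembleMetadataSketch(
        algo: String,
        treeAlgo: String,
        combiningStrategy: String,
        treeWeights: Seq[Double])

    object EnsembleMetadataJsonSketch {
      def main(args: Array[String]): Unit = {
        implicit val formats = DefaultFormats

        // Write side: decompose the case class into a JValue and nest it under a
        // "metadata" key next to class/version, mirroring the save path above.
        val original =
          EnsembleMetadataSketch("Classification", "Classification", "Sum", Seq(1.0, 1.0))
        val json = compact(render(
          ("class" -> "org.example.SomeEnsembleModel") ~
            ("version" -> "1.0") ~
            ("metadata" -> Extraction.decompose(original))))

        // Read side: extract the nested object straight back into the case class,
        // which is what the new readMetadata(metadata: JValue) does.
        val restored = (parse(json) \ "metadata").extract[EnsembleMetadataSketch]
        println(restored == original) // true
      }
    }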
* @return (class name, version, metadata) */ - def loadMetadata(sc: SparkContext, path: String): (String, String, DataFrame) = { - val sqlContext = new SQLContext(sc) - val metadata = sqlContext.jsonFile(metadataPath(path)) - val (clazz, version) = try { - val metadataArray = metadata.select("class", "version").take(1) - assert(metadataArray.size == 1) - metadataArray(0) match { - case Row(clazz: String, version: String) => (clazz, version) - } - } catch { - case e: Exception => - throw new Exception(s"Unable to load model metadata from: ${metadataPath(path)}") - } + def loadMetadata(sc: SparkContext, path: String): (String, String, JValue) = { + implicit val formats = DefaultFormats + val metadata = parse(sc.textFile(metadataPath(path)).first()) + val clazz = (metadata \ "class").extract[String] + val version = (metadata \ "version").extract[String] (clazz, version, metadata) } - } From bc57789bbb2a4fa7733594d30a5818f048aacfed Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Thu, 12 Feb 2015 20:14:45 +0000 Subject: [PATCH 134/357] SPARK-5776 JIRA version not of form x.y.z breaks merge_spark_pr.py Consider only x.y.z verisons from JIRA. CC JoshRosen who will probably know this script well. Alternative is to call the version "2.0.0" after all in JIRA. Author: Sean Owen Closes #4570 from srowen/SPARK-5776 and squashes the following commits: fffafde [Sean Owen] Consider only x.y.z verisons from JIRA --- dev/merge_spark_pr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index dfa924d2aa0ba..3062e9c3c6651 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -244,6 +244,8 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""): versions = asf_jira.project_versions("SPARK") versions = sorted(versions, key=lambda x: x.name, reverse=True) versions = filter(lambda x: x.raw['released'] is False, versions) + # Consider only x.y.z versions + versions = filter(lambda x: re.match('\d+\.\d+\.\d+', x.name), versions) default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches) for v in default_fix_versions: From 6a1be026cf37e4c8bf39133dfb4a73f7caedcc26 Mon Sep 17 00:00:00 2001 From: Antonio Navarro Perez Date: Thu, 12 Feb 2015 12:46:17 -0800 Subject: [PATCH 135/357] [SQL][DOCS] Update sql documentation Updated examples using the new api and added DataFrame concept Author: Antonio Navarro Perez Closes #4560 from ajnavarro/ajnavarro-doc-sql-update and squashes the following commits: 82ebcf3 [Antonio Navarro Perez] Changed a missing JavaSQLContext to SQLContext. 8d5376a [Antonio Navarro Perez] fixed typo 8196b6b [Antonio Navarro Perez] [SQL][DOCS] Update sql documentation --- docs/sql-programming-guide.md | 168 +++++++++++++++++----------------- 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index b2b007509c735..8022c5ecc2430 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -14,10 +14,10 @@ title: Spark SQL Spark SQL allows relational queries expressed in SQL, HiveQL, or Scala to be executed using Spark. At the core of this component is a new type of RDD, -[SchemaRDD](api/scala/index.html#org.apache.spark.sql.SchemaRDD). SchemaRDDs are composed of +[DataFrame](api/scala/index.html#org.apache.spark.sql.DataFrame). 
DataFrames are composed of [Row](api/scala/index.html#org.apache.spark.sql.package@Row:org.apache.spark.sql.catalyst.expressions.Row.type) objects, along with -a schema that describes the data types of each column in the row. A SchemaRDD is similar to a table -in a traditional relational database. A SchemaRDD can be created from an existing RDD, a [Parquet](http://parquet.io) +a schema that describes the data types of each column in the row. A DataFrame is similar to a table +in a traditional relational database. A DataFrame can be created from an existing RDD, a [Parquet](http://parquet.io) file, a JSON dataset, or by running HiveQL against data stored in [Apache Hive](http://hive.apache.org/). All of the examples on this page use sample data included in the Spark distribution and can be run in the `spark-shell`. @@ -27,10 +27,10 @@ All of the examples on this page use sample data included in the Spark distribut
    Spark SQL allows relational queries expressed in SQL or HiveQL to be executed using Spark. At the core of this component is a new type of RDD, -[JavaSchemaRDD](api/scala/index.html#org.apache.spark.sql.api.java.JavaSchemaRDD). JavaSchemaRDDs are composed of +[DataFrame](api/scala/index.html#org.apache.spark.sql.DataFrame). DataFrames are composed of [Row](api/scala/index.html#org.apache.spark.sql.api.java.Row) objects, along with -a schema that describes the data types of each column in the row. A JavaSchemaRDD is similar to a table -in a traditional relational database. A JavaSchemaRDD can be created from an existing RDD, a [Parquet](http://parquet.io) +a schema that describes the data types of each column in the row. A DataFrame is similar to a table +in a traditional relational database. A DataFrame can be created from an existing RDD, a [Parquet](http://parquet.io) file, a JSON dataset, or by running HiveQL against data stored in [Apache Hive](http://hive.apache.org/).
    @@ -38,10 +38,10 @@ file, a JSON dataset, or by running HiveQL against data stored in [Apache Hive]( Spark SQL allows relational queries expressed in SQL or HiveQL to be executed using Spark. At the core of this component is a new type of RDD, -[SchemaRDD](api/python/pyspark.sql.SchemaRDD-class.html). SchemaRDDs are composed of +[DataFrame](api/python/pyspark.sql.html#pyspark.sql.DataFrame). DataFrames are composed of [Row](api/python/pyspark.sql.Row-class.html) objects, along with -a schema that describes the data types of each column in the row. A SchemaRDD is similar to a table -in a traditional relational database. A SchemaRDD can be created from an existing RDD, a [Parquet](http://parquet.io) +a schema that describes the data types of each column in the row. A DataFrame is similar to a table +in a traditional relational database. A DataFrame can be created from an existing RDD, a [Parquet](http://parquet.io) file, a JSON dataset, or by running HiveQL against data stored in [Apache Hive](http://hive.apache.org/). All of the examples on this page use sample data included in the Spark distribution and can be run in the `pyspark` shell. @@ -65,8 +65,8 @@ descendants. To create a basic SQLContext, all you need is a SparkContext. val sc: SparkContext // An existing SparkContext. val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// createSchemaRDD is used to implicitly convert an RDD to a SchemaRDD. -import sqlContext.createSchemaRDD +// this is used to implicitly convert an RDD to a DataFrame. +import sqlContext.implicits._ {% endhighlight %} In addition to the basic SQLContext, you can also create a HiveContext, which provides a @@ -84,12 +84,12 @@ feature parity with a HiveContext.
    The entry point into all relational functionality in Spark is the -[JavaSQLContext](api/scala/index.html#org.apache.spark.sql.api.java.JavaSQLContext) class, or one -of its descendants. To create a basic JavaSQLContext, all you need is a JavaSparkContext. +[SQLContext](api/scala/index.html#org.apache.spark.sql.api.SQLContext) class, or one +of its descendants. To create a basic SQLContext, all you need is a JavaSparkContext. {% highlight java %} JavaSparkContext sc = ...; // An existing JavaSparkContext. -JavaSQLContext sqlContext = new org.apache.spark.sql.api.java.JavaSQLContext(sc); +SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc); {% endhighlight %} In addition to the basic SQLContext, you can also create a HiveContext, which provides a strict @@ -138,21 +138,21 @@ default is "hiveql", though "sql" is also available. Since the HiveQL parser is # Data Sources -Spark SQL supports operating on a variety of data sources through the `SchemaRDD` interface. -A SchemaRDD can be operated on as normal RDDs and can also be registered as a temporary table. -Registering a SchemaRDD as a table allows you to run SQL queries over its data. This section -describes the various methods for loading data into a SchemaRDD. +Spark SQL supports operating on a variety of data sources through the `DataFrame` interface. +A DataFrame can be operated on as normal RDDs and can also be registered as a temporary table. +Registering a DataFrame as a table allows you to run SQL queries over its data. This section +describes the various methods for loading data into a DataFrame. ## RDDs -Spark SQL supports two different methods for converting existing RDDs into SchemaRDDs. The first +Spark SQL supports two different methods for converting existing RDDs into DataFrames. The first method uses reflection to infer the schema of an RDD that contains specific types of objects. This reflection based approach leads to more concise code and works well when you already know the schema while writing your Spark application. -The second method for creating SchemaRDDs is through a programmatic interface that allows you to +The second method for creating DataFrames is through a programmatic interface that allows you to construct a schema and then apply it to an existing RDD. While this method is more verbose, it allows -you to construct SchemaRDDs when the columns and their types are not known until runtime. +you to construct DataFrames when the columns and their types are not known until runtime. ### Inferring the Schema Using Reflection
    @@ -160,17 +160,17 @@ you to construct SchemaRDDs when the columns and their types are not known until
    The Scala interaface for Spark SQL supports automatically converting an RDD containing case classes -to a SchemaRDD. The case class +to a DataFrame. The case class defines the schema of the table. The names of the arguments to the case class are read using reflection and become the names of the columns. Case classes can also be nested or contain complex -types such as Sequences or Arrays. This RDD can be implicitly converted to a SchemaRDD and then be +types such as Sequences or Arrays. This RDD can be implicitly converted to a DataFrame and then be registered as a table. Tables can be used in subsequent SQL statements. {% highlight scala %} // sc is an existing SparkContext. val sqlContext = new org.apache.spark.sql.SQLContext(sc) -// createSchemaRDD is used to implicitly convert an RDD to a SchemaRDD. -import sqlContext.createSchemaRDD +// this is used to implicitly convert an RDD to a DataFrame. +import sqlContext.implicits._ // Define the schema using a case class. // Note: Case classes in Scala 2.10 can support only up to 22 fields. To work around this limit, @@ -184,7 +184,7 @@ people.registerTempTable("people") // SQL statements can be run by using the sql methods provided by sqlContext. val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") -// The results of SQL queries are SchemaRDDs and support all the normal RDD operations. +// The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. teenagers.map(t => "Name: " + t(0)).collect().foreach(println) {% endhighlight %} @@ -194,7 +194,7 @@ teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
    Spark SQL supports automatically converting an RDD of [JavaBeans](http://stackoverflow.com/questions/3295496/what-is-a-javabean-exactly) -into a Schema RDD. The BeanInfo, obtained using reflection, defines the schema of the table. +into a DataFrame. The BeanInfo, obtained using reflection, defines the schema of the table. Currently, Spark SQL does not support JavaBeans that contain nested or contain complex types such as Lists or Arrays. You can create a JavaBean by creating a class that implements Serializable and has getters and setters for all of its fields. @@ -230,7 +230,7 @@ for the JavaBean. {% highlight java %} // sc is an existing JavaSparkContext. -JavaSQLContext sqlContext = new org.apache.spark.sql.api.java.JavaSQLContext(sc); +SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc); // Load a text file and convert each line to a JavaBean. JavaRDD people = sc.textFile("examples/src/main/resources/people.txt").map( @@ -247,13 +247,13 @@ JavaRDD people = sc.textFile("examples/src/main/resources/people.txt").m }); // Apply a schema to an RDD of JavaBeans and register it as a table. -JavaSchemaRDD schemaPeople = sqlContext.createDataFrame(people, Person.class); +DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class); schemaPeople.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. -JavaSchemaRDD teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") +DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") -// The results of SQL queries are SchemaRDDs and support all the normal RDD operations. +// The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. List teenagerNames = teenagers.map(new Function() { public String call(Row row) { @@ -267,7 +267,7 @@ List teenagerNames = teenagers.map(new Function() {
    -Spark SQL can convert an RDD of Row objects to a SchemaRDD, inferring the datatypes. Rows are constructed by passing a list of +Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the datatypes. Rows are constructed by passing a list of key/value pairs as kwargs to the Row class. The keys of this list define the column names of the table, and the types are inferred by looking at the first row. Since we currently only look at the first row, it is important that there is no missing data in the first row of the RDD. In future versions we @@ -284,11 +284,11 @@ lines = sc.textFile("examples/src/main/resources/people.txt") parts = lines.map(lambda l: l.split(",")) people = parts.map(lambda p: Row(name=p[0], age=int(p[1]))) -# Infer the schema, and register the SchemaRDD as a table. +# Infer the schema, and register the DataFrame as a table. schemaPeople = sqlContext.inferSchema(people) schemaPeople.registerTempTable("people") -# SQL can be run over SchemaRDDs that have been registered as a table. +# SQL can be run over DataFrames that have been registered as a table. teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") # The results of SQL queries are RDDs and support all the normal RDD operations. @@ -310,7 +310,7 @@ for teenName in teenNames.collect(): When case classes cannot be defined ahead of time (for example, the structure of records is encoded in a string, or a text dataset will be parsed and fields will be projected differently for different users), -a `SchemaRDD` can be created programmatically with three steps. +a `DataFrame` can be created programmatically with three steps. 1. Create an RDD of `Row`s from the original RDD; 2. Create the schema represented by a `StructType` matching the structure of @@ -341,15 +341,15 @@ val schema = val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim)) // Apply the schema to the RDD. -val peopleSchemaRDD = sqlContext.createDataFrame(rowRDD, schema) +val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema) -// Register the SchemaRDD as a table. -peopleSchemaRDD.registerTempTable("people") +// Register the DataFrames as a table. +peopleDataFrame.registerTempTable("people") // SQL statements can be run by using the sql methods provided by sqlContext. val results = sqlContext.sql("SELECT name FROM people") -// The results of SQL queries are SchemaRDDs and support all the normal RDD operations. +// The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. results.map(t => "Name: " + t(0)).collect().foreach(println) {% endhighlight %} @@ -362,13 +362,13 @@ results.map(t => "Name: " + t(0)).collect().foreach(println) When JavaBean classes cannot be defined ahead of time (for example, the structure of records is encoded in a string, or a text dataset will be parsed and fields will be projected differently for different users), -a `SchemaRDD` can be created programmatically with three steps. +a `DataFrame` can be created programmatically with three steps. 1. Create an RDD of `Row`s from the original RDD; 2. Create the schema represented by a `StructType` matching the structure of `Row`s in the RDD created in Step 1. 3. Apply the schema to the RDD of `Row`s via `createDataFrame` method provided -by `JavaSQLContext`. +by `SQLContext`. 
For example: {% highlight java %} @@ -381,7 +381,7 @@ import org.apache.spark.sql.api.java.StructField import org.apache.spark.sql.api.java.Row // sc is an existing JavaSparkContext. -JavaSQLContext sqlContext = new org.apache.spark.sql.api.java.JavaSQLContext(sc); +SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc); // Load a text file and convert each line to a JavaBean. JavaRDD people = sc.textFile("examples/src/main/resources/people.txt"); @@ -406,15 +406,15 @@ JavaRDD rowRDD = people.map( }); // Apply the schema to the RDD. -JavaSchemaRDD peopleSchemaRDD = sqlContext.createDataFrame(rowRDD, schema); +DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema); -// Register the SchemaRDD as a table. -peopleSchemaRDD.registerTempTable("people"); +// Register the DataFrame as a table. +peopleDataFrame.registerTempTable("people"); // SQL can be run over RDDs that have been registered as tables. -JavaSchemaRDD results = sqlContext.sql("SELECT name FROM people"); +DataFrame results = sqlContext.sql("SELECT name FROM people"); -// The results of SQL queries are SchemaRDDs and support all the normal RDD operations. +// The results of SQL queries are DataFrames and support all the normal RDD operations. // The columns of a row in the result can be accessed by ordinal. List names = results.map(new Function() { public String call(Row row) { @@ -431,7 +431,7 @@ List names = results.map(new Function() { When a dictionary of kwargs cannot be defined ahead of time (for example, the structure of records is encoded in a string, or a text dataset will be parsed and fields will be projected differently for different users), -a `SchemaRDD` can be created programmatically with three steps. +a `DataFrame` can be created programmatically with three steps. 1. Create an RDD of tuples or lists from the original RDD; 2. Create the schema represented by a `StructType` matching the structure of @@ -460,10 +460,10 @@ schema = StructType(fields) # Apply the schema to the RDD. schemaPeople = sqlContext.createDataFrame(people, schema) -# Register the SchemaRDD as a table. +# Register the DataFrame as a table. schemaPeople.registerTempTable("people") -# SQL can be run over SchemaRDDs that have been registered as a table. +# SQL can be run over DataFrames that have been registered as a table. results = sqlContext.sql("SELECT name FROM people") # The results of SQL queries are RDDs and support all the normal RDD operations. @@ -493,16 +493,16 @@ Using the data from the above example: {% highlight scala %} // sqlContext from the previous example is used in this example. -// createSchemaRDD is used to implicitly convert an RDD to a SchemaRDD. -import sqlContext.createSchemaRDD +// This is used to implicitly convert an RDD to a DataFrame. +import sqlContext.implicits._ val people: RDD[Person] = ... // An RDD of case class objects, from the previous example. -// The RDD is implicitly converted to a SchemaRDD by createSchemaRDD, allowing it to be stored using Parquet. +// The RDD is implicitly converted to a DataFrame by implicits, allowing it to be stored using Parquet. people.saveAsParquetFile("people.parquet") // Read in the parquet file created above. Parquet files are self-describing so the schema is preserved. -// The result of loading a Parquet file is also a SchemaRDD. +// The result of loading a Parquet file is also a DataFrame. val parquetFile = sqlContext.parquetFile("people.parquet") //Parquet files can also be registered as tables and then used in SQL statements. 
@@ -518,18 +518,18 @@ teenagers.map(t => "Name: " + t(0)).collect().foreach(println) {% highlight java %} // sqlContext from the previous example is used in this example. -JavaSchemaRDD schemaPeople = ... // The JavaSchemaRDD from the previous example. +DataFrame schemaPeople = ... // The DataFrame from the previous example. -// JavaSchemaRDDs can be saved as Parquet files, maintaining the schema information. +// DataFrames can be saved as Parquet files, maintaining the schema information. schemaPeople.saveAsParquetFile("people.parquet"); // Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved. -// The result of loading a parquet file is also a JavaSchemaRDD. -JavaSchemaRDD parquetFile = sqlContext.parquetFile("people.parquet"); +// The result of loading a parquet file is also a DataFrame. +DataFrame parquetFile = sqlContext.parquetFile("people.parquet"); //Parquet files can also be registered as tables and then used in SQL statements. parquetFile.registerTempTable("parquetFile"); -JavaSchemaRDD teenagers = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); +DataFrame teenagers = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"); List teenagerNames = teenagers.map(new Function() { public String call(Row row) { return "Name: " + row.getString(0); @@ -544,13 +544,13 @@ List teenagerNames = teenagers.map(new Function() { {% highlight python %} # sqlContext from the previous example is used in this example. -schemaPeople # The SchemaRDD from the previous example. +schemaPeople # The DataFrame from the previous example. -# SchemaRDDs can be saved as Parquet files, maintaining the schema information. +# DataFrames can be saved as Parquet files, maintaining the schema information. schemaPeople.saveAsParquetFile("people.parquet") # Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved. -# The result of loading a parquet file is also a SchemaRDD. +# The result of loading a parquet file is also a DataFrame. parquetFile = sqlContext.parquetFile("people.parquet") # Parquet files can also be registered as tables and then used in SQL statements. @@ -629,7 +629,7 @@ Configuration of Parquet can be done using the `setConf` method on SQLContext or
    -Spark SQL can automatically infer the schema of a JSON dataset and load it as a SchemaRDD. +Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame. This conversion can be done using one of two methods in a SQLContext: * `jsonFile` - loads data from a directory of JSON files where each line of the files is a JSON object. @@ -646,7 +646,7 @@ val sqlContext = new org.apache.spark.sql.SQLContext(sc) // A JSON dataset is pointed to by path. // The path can be either a single text file or a directory storing text files. val path = "examples/src/main/resources/people.json" -// Create a SchemaRDD from the file(s) pointed to by path +// Create a DataFrame from the file(s) pointed to by path val people = sqlContext.jsonFile(path) // The inferred schema can be visualized using the printSchema() method. @@ -655,13 +655,13 @@ people.printSchema() // |-- age: integer (nullable = true) // |-- name: string (nullable = true) -// Register this SchemaRDD as a table. +// Register this DataFrame as a table. people.registerTempTable("people") // SQL statements can be run by using the sql methods provided by sqlContext. val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") -// Alternatively, a SchemaRDD can be created for a JSON dataset represented by +// Alternatively, a DataFrame can be created for a JSON dataset represented by // an RDD[String] storing one JSON object per string. val anotherPeopleRDD = sc.parallelize( """{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil) @@ -671,8 +671,8 @@ val anotherPeople = sqlContext.jsonRDD(anotherPeopleRDD)
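As a small aside that is not part of the guide text itself: the nested `anotherPeople` dataset from the Scala example ends up with a struct-typed `address` column, which can be inspected with `printSchema()`. The query below additionally assumes that dotted access to struct fields is accepted by the SQL parser used here.

{% highlight scala %}
// Assumes the anotherPeople DataFrame created above from the nested JSON record.
anotherPeople.printSchema()
// Prints the inferred schema: an "address" struct with "city" and "state"
// string fields, plus a top-level "name" string field.

anotherPeople.registerTempTable("another_people")
// Assumes dotted access to struct fields is accepted by this SQL parser.
val cities = sqlContext.sql("SELECT address.city FROM another_people")
cities.collect().foreach(println)
{% endhighlight %}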
    -Spark SQL can automatically infer the schema of a JSON dataset and load it as a JavaSchemaRDD. -This conversion can be done using one of two methods in a JavaSQLContext : +Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame. +This conversion can be done using one of two methods in a SQLContext : * `jsonFile` - loads data from a directory of JSON files where each line of the files is a JSON object. * `jsonRDD` - loads data from an existing RDD where each element of the RDD is a string containing a JSON object. @@ -683,13 +683,13 @@ a regular multi-line JSON file will most often fail. {% highlight java %} // sc is an existing JavaSparkContext. -JavaSQLContext sqlContext = new org.apache.spark.sql.api.java.JavaSQLContext(sc); +SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc); // A JSON dataset is pointed to by path. // The path can be either a single text file or a directory storing text files. String path = "examples/src/main/resources/people.json"; -// Create a JavaSchemaRDD from the file(s) pointed to by path -JavaSchemaRDD people = sqlContext.jsonFile(path); +// Create a DataFrame from the file(s) pointed to by path +DataFrame people = sqlContext.jsonFile(path); // The inferred schema can be visualized using the printSchema() method. people.printSchema(); @@ -697,23 +697,23 @@ people.printSchema(); // |-- age: integer (nullable = true) // |-- name: string (nullable = true) -// Register this JavaSchemaRDD as a table. +// Register this DataFrame as a table. people.registerTempTable("people"); // SQL statements can be run by using the sql methods provided by sqlContext. -JavaSchemaRDD teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); +DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"); -// Alternatively, a JavaSchemaRDD can be created for a JSON dataset represented by +// Alternatively, a DataFrame can be created for a JSON dataset represented by // an RDD[String] storing one JSON object per string. List jsonData = Arrays.asList( "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}"); JavaRDD anotherPeopleRDD = sc.parallelize(jsonData); -JavaSchemaRDD anotherPeople = sqlContext.jsonRDD(anotherPeopleRDD); +DataFrame anotherPeople = sqlContext.jsonRDD(anotherPeopleRDD); {% endhighlight %}
    -Spark SQL can automatically infer the schema of a JSON dataset and load it as a SchemaRDD. +Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame. This conversion can be done using one of two methods in a SQLContext: * `jsonFile` - loads data from a directory of JSON files where each line of the files is a JSON object. @@ -731,7 +731,7 @@ sqlContext = SQLContext(sc) # A JSON dataset is pointed to by path. # The path can be either a single text file or a directory storing text files. path = "examples/src/main/resources/people.json" -# Create a SchemaRDD from the file(s) pointed to by path +# Create a DataFrame from the file(s) pointed to by path people = sqlContext.jsonFile(path) # The inferred schema can be visualized using the printSchema() method. @@ -740,13 +740,13 @@ people.printSchema() # |-- age: integer (nullable = true) # |-- name: string (nullable = true) -# Register this SchemaRDD as a table. +# Register this DataFrame as a table. people.registerTempTable("people") # SQL statements can be run by using the sql methods provided by sqlContext. teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") -# Alternatively, a SchemaRDD can be created for a JSON dataset represented by +# Alternatively, a DataFrame can be created for a JSON dataset represented by # an RDD[String] storing one JSON object per string. anotherPeopleRDD = sc.parallelize([ '{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}']) @@ -792,14 +792,14 @@ sqlContext.sql("FROM src SELECT key, value").collect().foreach(println)
    -When working with Hive one must construct a `JavaHiveContext`, which inherits from `JavaSQLContext`, and +When working with Hive one must construct a `HiveContext`, which inherits from `SQLContext`, and adds support for finding tables in the MetaStore and writing queries using HiveQL. In addition to -the `sql` method a `JavaHiveContext` also provides an `hql` methods, which allows queries to be +the `sql` method a `HiveContext` also provides an `hql` methods, which allows queries to be expressed in HiveQL. {% highlight java %} // sc is an existing JavaSparkContext. -JavaHiveContext sqlContext = new org.apache.spark.sql.hive.api.java.HiveContext(sc); +HiveContext sqlContext = new org.apache.spark.sql.hive.HiveContext(sc); sqlContext.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)"); sqlContext.sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"); @@ -841,7 +841,7 @@ turning on some experimental options. ## Caching Data In Memory -Spark SQL can cache tables using an in-memory columnar format by calling `sqlContext.cacheTable("tableName")` or `schemaRDD.cache()`. +Spark SQL can cache tables using an in-memory columnar format by calling `sqlContext.cacheTable("tableName")` or `dataFrame.cache()`. Then Spark SQL will scan only required columns and will automatically tune compression to minimize memory usage and GC pressure. You can call `sqlContext.uncacheTable("tableName")` to remove the table from memory. @@ -1161,7 +1161,7 @@ teenagers.map(t => "Name: " + t(0)).collect().foreach(println) The DSL uses Scala symbols to represent columns in the underlying table, which are identifiers prefixed with a tick (`'`). Implicit conversions turn these symbols into expressions that are evaluated by the SQL execution engine. A full list of the functions supported can be found in the -[ScalaDoc](api/scala/index.html#org.apache.spark.sql.SchemaRDD). +[ScalaDoc](api/scala/index.html#org.apache.spark.sql.DataFrame). 
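Taken together, the renames in this documentation update amount to the end-to-end flow below. This condensed, runnable sketch uses only calls that appear in the updated examples (createDataFrame, registerTempTable, sql); the application name, master setting, and sample rows are illustrative, standing in for the people.txt file used in the guide.

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.{Row, SQLContext}
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    object DataFrameFlowSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(
          new SparkConf().setAppName("DataFrameFlowSketch").setMaster("local[2]"))
        val sqlContext = new SQLContext(sc)

        // Build a schema programmatically, mirroring the guide's example.
        val schemaString = "name age"
        val schema = StructType(
          schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))

        // Illustrative rows standing in for examples/src/main/resources/people.txt.
        val rowRDD = sc.parallelize(Seq("Michael, 29", "Andy, 30"))
          .map(_.split(",")).map(p => Row(p(0), p(1).trim))

        // Apply the schema, register the DataFrame as a table, and query it with SQL.
        val people = sqlContext.createDataFrame(rowRDD, schema)
        people.registerTempTable("people")
        val results = sqlContext.sql("SELECT name FROM people")
        results.map(t => "Name: " + t(0)).collect().foreach(println)

        sc.stop()
      }
    }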
From aa4ca8b873fd83e64e5faea6f7febcc830e30b02 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 12 Feb 2015 13:11:28 -0800 Subject: [PATCH 136/357] [SQL] Improve error messages Author: Michael Armbrust Author: wangfei Closes #4558 from marmbrus/errorMessages and squashes the following commits: 5e5ab50 [Michael Armbrust] Merge pull request #15 from scwf/errorMessages fa38881 [wangfei] fix for grouping__id f279a71 [wangfei] make right references for ScriptTransformation d29fbde [Michael Armbrust] extra case 1a797b4 [Michael Armbrust] comments d4e9015 [Michael Armbrust] add comment af9e668 [Michael Armbrust] no braces 34eb3a4 [Michael Armbrust] more work 6197cd5 [Michael Armbrust] [SQL] Better error messages for analysis failures --- .../sql/catalyst/analysis/Analyzer.scala | 123 ++++++++++-------- .../catalyst/expressions/AttributeSet.scala | 6 +- .../expressions/namedExpressions.scala | 2 +- .../plans/logical/ScriptTransformation.scala | 6 +- .../spark/sql/catalyst/trees/TreeNode.scala | 9 ++ .../sql/catalyst/analysis/AnalysisSuite.scala | 79 +++++++---- .../BooleanSimplificationSuite.scala | 4 +- .../optimizer/ConstantFoldingSuite.scala | 4 +- .../optimizer/FilterPushdownSuite.scala | 12 +- .../catalyst/optimizer/OptimizeInSuite.scala | 4 +- .../optimizer/UnionPushdownSuite.scala | 4 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 6 +- .../apache/spark/sql/hive/HiveContext.scala | 4 +- .../spark/sql/hive/execution/commands.scala | 4 +- 14 files changed, 164 insertions(+), 103 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 2d1fa106a2aa9..58a7003977c93 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.types.{ArrayType, StructField, StructType, IntegerType} +import org.apache.spark.sql.types._ /** * A trivial [[Analyzer]] with an [[EmptyCatalog]] and [[EmptyFunctionRegistry]]. Used for testing @@ -66,32 +66,82 @@ class Analyzer(catalog: Catalog, typeCoercionRules ++ extendedRules : _*), Batch("Check Analysis", Once, - CheckResolution :: - CheckAggregation :: - Nil: _*), - Batch("AnalysisOperators", fixedPoint, - EliminateAnalysisOperators) + CheckResolution), + Batch("Remove SubQueries", fixedPoint, + EliminateSubQueries) ) /** * Makes sure all attributes and logical plans have been resolved. */ object CheckResolution extends Rule[LogicalPlan] { + def failAnalysis(msg: String) = { throw new AnalysisException(msg) } + def apply(plan: LogicalPlan): LogicalPlan = { - plan.transformUp { - case p if p.expressions.exists(!_.resolved) => - val missing = p.expressions.filterNot(_.resolved).map(_.prettyString).mkString(",") - val from = p.inputSet.map(_.name).mkString("{", ", ", "}") - - throw new AnalysisException(s"Cannot resolve '$missing' given input columns $from") - case p if !p.resolved && p.childrenResolved => - throw new AnalysisException(s"Unresolved operator in the query plan ${p.simpleString}") - } match { - // As a backstop, use the root node to check that the entire plan tree is resolved. 
- case p if !p.resolved => - throw new AnalysisException(s"Unresolved operator in the query plan ${p.simpleString}") - case p => p + // We transform up and order the rules so as to catch the first possible failure instead + // of the result of cascading resolution failures. + plan.foreachUp { + case operator: LogicalPlan => + operator transformExpressionsUp { + case a: Attribute if !a.resolved => + val from = operator.inputSet.map(_.name).mkString(", ") + failAnalysis(s"cannot resolve '${a.prettyString}' given input columns $from") + + case c: Cast if !c.resolved => + failAnalysis( + s"invalid cast from ${c.child.dataType.simpleString} to ${c.dataType.simpleString}") + + case b: BinaryExpression if !b.resolved => + failAnalysis( + s"invalid expression ${b.prettyString} " + + s"between ${b.left.simpleString} and ${b.right.simpleString}") + } + + operator match { + case f: Filter if f.condition.dataType != BooleanType => + failAnalysis( + s"filter expression '${f.condition.prettyString}' " + + s"of type ${f.condition.dataType.simpleString} is not a boolean.") + + case aggregatePlan @ Aggregate(groupingExprs, aggregateExprs, child) => + def checkValidAggregateExpression(expr: Expression): Unit = expr match { + case _: AggregateExpression => // OK + case e: Attribute if !groupingExprs.contains(e) => + failAnalysis( + s"expression '${e.prettyString}' is neither present in the group by, " + + s"nor is it an aggregate function. " + + "Add to group by or wrap in first() if you don't care which value you get.") + case e if groupingExprs.contains(e) => // OK + case e if e.references.isEmpty => // OK + case e => e.children.foreach(checkValidAggregateExpression) + } + + val cleaned = aggregateExprs.map(_.transform { + // Should trim aliases around `GetField`s. These aliases are introduced while + // resolving struct field accesses, because `GetField` is not a `NamedExpression`. + // (Should we just turn `GetField` into a `NamedExpression`?) + case Alias(g, _) => g + }) + + cleaned.foreach(checkValidAggregateExpression) + + case o if o.children.nonEmpty && + !o.references.filter(_.name != "grouping__id").subsetOf(o.inputSet) => + val missingAttributes = (o.references -- o.inputSet).map(_.prettyString).mkString(",") + val input = o.inputSet.map(_.prettyString).mkString(",") + + failAnalysis(s"resolved attributes $missingAttributes missing from $input") + + // Catch all + case o if !o.resolved => + failAnalysis( + s"unresolved operator ${operator.simpleString}") + + case _ => // Analysis successful! + } } + + plan } } @@ -192,37 +242,6 @@ class Analyzer(catalog: Catalog, } } - /** - * Checks for non-aggregated attributes with aggregation - */ - object CheckAggregation extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = { - plan.transform { - case aggregatePlan @ Aggregate(groupingExprs, aggregateExprs, child) => - def isValidAggregateExpression(expr: Expression): Boolean = expr match { - case _: AggregateExpression => true - case e: Attribute => groupingExprs.contains(e) - case e if groupingExprs.contains(e) => true - case e if e.references.isEmpty => true - case e => e.children.forall(isValidAggregateExpression) - } - - aggregateExprs.find { e => - !isValidAggregateExpression(e.transform { - // Should trim aliases around `GetField`s. These aliases are introduced while - // resolving struct field accesses, because `GetField` is not a `NamedExpression`. - // (Should we just turn `GetField` into a `NamedExpression`?) 
- case Alias(g: GetField, _) => g - }) - }.foreach { e => - throw new TreeNodeException(plan, s"Expression not in GROUP BY: $e") - } - - aggregatePlan - } - } - } - /** * Replaces [[UnresolvedRelation]]s with concrete relations from the catalog. */ @@ -230,7 +249,7 @@ class Analyzer(catalog: Catalog, def apply(plan: LogicalPlan): LogicalPlan = plan transform { case i @ InsertIntoTable(UnresolvedRelation(tableIdentifier, alias), _, _, _) => i.copy( - table = EliminateAnalysisOperators(catalog.lookupRelation(tableIdentifier, alias))) + table = EliminateSubQueries(catalog.lookupRelation(tableIdentifier, alias))) case UnresolvedRelation(tableIdentifier, alias) => catalog.lookupRelation(tableIdentifier, alias) } @@ -477,7 +496,7 @@ class Analyzer(catalog: Catalog, * only required to provide scoping information for attributes and can be removed once analysis is * complete. */ -object EliminateAnalysisOperators extends Rule[LogicalPlan] { +object EliminateSubQueries extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transform { case Subquery(_, child) => child } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala index 171845ad14e3e..a9ba0be596349 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala @@ -20,7 +20,11 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.analysis.Star protected class AttributeEquals(val a: Attribute) { - override def hashCode() = a.exprId.hashCode() + override def hashCode() = a match { + case ar: AttributeReference => ar.exprId.hashCode() + case a => a.hashCode() + } + override def equals(other: Any) = (a, other.asInstanceOf[AttributeEquals].a) match { case (a1: AttributeReference, a2: AttributeReference) => a1.exprId == a2.exprId case (a1, a2) => a1 == a2 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index f77c56311cc8c..62c062be6d820 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -218,7 +218,7 @@ case class PrettyAttribute(name: String) extends Attribute with trees.LeafNode[E override def exprId: ExprId = ??? override def eval(input: Row): EvaluatedType = ??? override def nullable: Boolean = ??? - override def dataType: DataType = ??? 
+ override def dataType: DataType = NullType } object VirtualColumn { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ScriptTransformation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ScriptTransformation.scala index cfe2c7a39a17c..ccf5291219add 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ScriptTransformation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ScriptTransformation.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Attribute, Expression} /** * Transforms the input by forking and running the specified script. @@ -32,7 +32,9 @@ case class ScriptTransformation( script: String, output: Seq[Attribute], child: LogicalPlan, - ioschema: ScriptInputOutputSchema) extends UnaryNode + ioschema: ScriptInputOutputSchema) extends UnaryNode { + override def references: AttributeSet = AttributeSet(input.flatMap(_.references)) +} /** * A placeholder for implementation specific input and output properties when passing data diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 2013ae4f7bd13..e0930b056d5fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -46,6 +46,15 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] { children.foreach(_.foreach(f)) } + /** + * Runs the given function recursively on [[children]] then on this node. + * @param f the function to be applied to each node in the tree. + */ + def foreachUp(f: BaseType => Unit): Unit = { + children.foreach(_.foreach(f)) + f(this) + } + /** * Returns a Seq containing the result of applying the given function to each * node in this tree in a preorder traversal. 
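The commit above adds a foreachUp hook to TreeNode and rewrites CheckResolution to walk the plan bottom-up, so that the first failure reported is the deepest one rather than a cascade of follow-on errors. The toy sketch below only illustrates that pre-order versus post-order distinction; Node, preOrder, and postOrder are ad-hoc names, not Catalyst API, and the sketch is not a copy of the patch's implementation.

    // A stand-in tree, not Catalyst's TreeNode.
    case class Node(name: String, children: Seq[Node] = Nil) {
      // Pre-order: visit this node first, then its subtrees.
      def preOrder(f: Node => Unit): Unit = {
        f(this)
        children.foreach(_.preOrder(f))
      }
      // Post-order: visit the subtrees first, then this node.
      def postOrder(f: Node => Unit): Unit = {
        children.foreach(_.postOrder(f))
        f(this)
      }
    }

    object TraversalOrderSketch {
      def main(args: Array[String]): Unit = {
        // A toy "plan": Project over Filter over Relation.
        val plan = Node("Project", Seq(Node("Filter", Seq(Node("Relation")))))

        plan.preOrder(n => print(n.name + " "))  // Project Filter Relation
        println()
        plan.postOrder(n => print(n.name + " ")) // Relation Filter Project
        println()

        // Checking an operator only after its children have been visited means a
        // failing check surfaces the deepest problem first, instead of the
        // cascade of failures it would cause in the operators above it.
      }
    }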
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index f011a5ff15ea9..e70c651e1486e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis import org.scalatest.{BeforeAndAfter, FunSuite} import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference} +import org.apache.spark.sql.catalyst.expressions.{Literal, Alias, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ @@ -108,24 +108,56 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { testRelation) } - test("throw errors for unresolved attributes during analysis") { - val e = intercept[AnalysisException] { - caseSensitiveAnalyze(Project(Seq(UnresolvedAttribute("abcd")), testRelation)) + def errorTest( + name: String, + plan: LogicalPlan, + errorMessages: Seq[String], + caseSensitive: Boolean = true) = { + test(name) { + val error = intercept[AnalysisException] { + if(caseSensitive) { + caseSensitiveAnalyze(plan) + } else { + caseInsensitiveAnalyze(plan) + } + } + + errorMessages.foreach(m => assert(error.getMessage contains m)) } - assert(e.getMessage().toLowerCase.contains("cannot resolve")) } - test("throw errors for unresolved plans during analysis") { - case class UnresolvedTestPlan() extends LeafNode { - override lazy val resolved = false - override def output = Nil - } - val e = intercept[AnalysisException] { - caseSensitiveAnalyze(UnresolvedTestPlan()) - } - assert(e.getMessage().toLowerCase.contains("unresolved")) + errorTest( + "unresolved attributes", + testRelation.select('abcd), + "cannot resolve" :: "abcd" :: Nil) + + errorTest( + "bad casts", + testRelation.select(Literal(1).cast(BinaryType).as('badCast)), + "invalid cast" :: Literal(1).dataType.simpleString :: BinaryType.simpleString :: Nil) + + errorTest( + "non-boolean filters", + testRelation.where(Literal(1)), + "filter" :: "'1'" :: "not a boolean" :: Literal(1).dataType.simpleString :: Nil) + + errorTest( + "missing group by", + testRelation2.groupBy('a)('b), + "'b'" :: "group by" :: Nil + ) + + case class UnresolvedTestPlan() extends LeafNode { + override lazy val resolved = false + override def output = Nil } + errorTest( + "catch all unresolved plan", + UnresolvedTestPlan(), + "unresolved" :: Nil) + + test("divide should be casted into fractional types") { val testRelation2 = LocalRelation( AttributeReference("a", StringType)(), @@ -134,18 +166,15 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { AttributeReference("d", DecimalType.Unlimited)(), AttributeReference("e", ShortType)()) - val expr0 = 'a / 2 - val expr1 = 'a / 'b - val expr2 = 'a / 'c - val expr3 = 'a / 'd - val expr4 = 'e / 'e - val plan = caseInsensitiveAnalyze(Project( - Alias(expr0, s"Analyzer($expr0)")() :: - Alias(expr1, s"Analyzer($expr1)")() :: - Alias(expr2, s"Analyzer($expr2)")() :: - Alias(expr3, s"Analyzer($expr3)")() :: - Alias(expr4, s"Analyzer($expr4)")() :: Nil, testRelation2)) + val plan = caseInsensitiveAnalyze( + testRelation2.select( + 'a / Literal(2) as 'div1, + 'a / 'b as 'div2, + 'a / 'c as 'div3, + 'a / 'd as 'div4, + 'e / 'e as 'div5)) val pl = plan.asInstanceOf[Project].projectList + assert(pl(0).dataType 
== DoubleType) assert(pl(1).dataType == DoubleType) assert(pl(2).dataType == DoubleType) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala index 264a0eff37d34..72f06e26e05f1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.optimizer -import org.apache.spark.sql.catalyst.analysis.EliminateAnalysisOperators +import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.PlanTest @@ -30,7 +30,7 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("AnalysisNodes", Once, - EliminateAnalysisOperators) :: + EliminateSubQueries) :: Batch("Constant Folding", FixedPoint(50), NullPropagation, ConstantFolding, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index e22c62505860a..ef10c0aece716 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.optimizer -import org.apache.spark.sql.catalyst.analysis.{UnresolvedGetField, EliminateAnalysisOperators} +import org.apache.spark.sql.catalyst.analysis.{UnresolvedGetField, EliminateSubQueries} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.plans.PlanTest @@ -33,7 +33,7 @@ class ConstantFoldingSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("AnalysisNodes", Once, - EliminateAnalysisOperators) :: + EliminateSubQueries) :: Batch("ConstantFolding", Once, ConstantFolding, BooleanSimplification) :: Nil diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index 1158b5dfc6147..0b74bacb18f4b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.analysis -import org.apache.spark.sql.catalyst.analysis.EliminateAnalysisOperators +import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries import org.apache.spark.sql.catalyst.expressions.Explode import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.{PlanTest, LeftOuter, RightOuter} @@ -32,7 +32,7 @@ class FilterPushdownSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Subqueries", Once, - EliminateAnalysisOperators) :: + EliminateSubQueries) :: Batch("Filter Pushdown", Once, 
CombineFilters, PushPredicateThroughProject, @@ -351,7 +351,7 @@ class FilterPushdownSuite extends PlanTest { } val optimized = Optimize(originalQuery.analyze) - comparePlans(analysis.EliminateAnalysisOperators(originalQuery.analyze), optimized) + comparePlans(analysis.EliminateSubQueries(originalQuery.analyze), optimized) } test("joins: conjunctive predicates") { @@ -370,7 +370,7 @@ class FilterPushdownSuite extends PlanTest { left.join(right, condition = Some("x.b".attr === "y.b".attr)) .analyze - comparePlans(optimized, analysis.EliminateAnalysisOperators(correctAnswer)) + comparePlans(optimized, analysis.EliminateSubQueries(correctAnswer)) } test("joins: conjunctive predicates #2") { @@ -389,7 +389,7 @@ class FilterPushdownSuite extends PlanTest { left.join(right, condition = Some("x.b".attr === "y.b".attr)) .analyze - comparePlans(optimized, analysis.EliminateAnalysisOperators(correctAnswer)) + comparePlans(optimized, analysis.EliminateSubQueries(correctAnswer)) } test("joins: conjunctive predicates #3") { @@ -412,7 +412,7 @@ class FilterPushdownSuite extends PlanTest { condition = Some("z.a".attr === "x.b".attr)) .analyze - comparePlans(optimized, analysis.EliminateAnalysisOperators(correctAnswer)) + comparePlans(optimized, analysis.EliminateSubQueries(correctAnswer)) } val testRelationWithArrayType = LocalRelation('a.int, 'b.int, 'c_arr.array(IntegerType)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala index da912ab382179..233e329cb2038 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.optimizer import scala.collection.immutable.HashSet -import org.apache.spark.sql.catalyst.analysis.{EliminateAnalysisOperators, UnresolvedAttribute} +import org.apache.spark.sql.catalyst.analysis.{EliminateSubQueries, UnresolvedAttribute} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.plans.PlanTest @@ -34,7 +34,7 @@ class OptimizeInSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("AnalysisNodes", Once, - EliminateAnalysisOperators) :: + EliminateSubQueries) :: Batch("ConstantFolding", Once, ConstantFolding, BooleanSimplification, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala index dfef87bd9133d..a54751dfa9a12 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/UnionPushdownSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.analysis -import org.apache.spark.sql.catalyst.analysis.EliminateAnalysisOperators +import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.{PlanTest, LeftOuter, RightOuter} import org.apache.spark.sql.catalyst.rules._ @@ -29,7 +29,7 @@ class UnionPushdownSuite extends PlanTest { object Optimize extends 
RuleExecutor[LogicalPlan] { val batches = Batch("Subqueries", Once, - EliminateAnalysisOperators) :: + EliminateSubQueries) :: Batch("Union Pushdown", Once, UnionPushdown) :: Nil } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index bba8899651259..a1c8cf58f2357 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -806,10 +806,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { test("throw errors for non-aggregate attributes with aggregation") { def checkAggregation(query: String, isInvalidQuery: Boolean = true) { if (isInvalidQuery) { - val e = intercept[TreeNodeException[LogicalPlan]](sql(query).queryExecution.analyzed) - assert( - e.getMessage.startsWith("Expression not in GROUP BY"), - "Non-aggregate attribute(s) not detected\n") + val e = intercept[AnalysisException](sql(query).queryExecution.analyzed) + assert(e.getMessage contains "group by") } else { // Should not throw sql(query).queryExecution.analyzed diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 7ae6ed6f841bf..ddc7b181d4d46 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -37,7 +37,7 @@ import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ScalaReflection -import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateAnalysisOperators, OverrideCatalog, OverrideFunctionRegistry} +import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateSubQueries, OverrideCatalog, OverrideFunctionRegistry} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, SetCommand, QueryExecutionException} import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DescribeHiveTableCommand} @@ -104,7 +104,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { */ @Experimental def analyze(tableName: String) { - val relation = EliminateAnalysisOperators(catalog.lookupRelation(Seq(tableName))) + val relation = EliminateSubQueries(catalog.lookupRelation(Seq(tableName))) relation match { case relation: MetastoreRelation => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index f6bea1c6a6fe1..ce0db1125c27f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.catalyst.analysis.EliminateAnalysisOperators +import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.{DataFrame, SQLContext} @@ -175,7 +175,7 @@ case class CreateMetastoreDataSourceAsSelect( val resolved = ResolvedDataSource(sqlContext, Some(query.schema), provider, optionsWithPath) val createdRelation = LogicalRelation(resolved.relation) - 
EliminateAnalysisOperators(sqlContext.table(tableName).logicalPlan) match { + EliminateSubQueries(sqlContext.table(tableName).logicalPlan) match { case l @ LogicalRelation(i: InsertableRelation) => if (l.schema != createdRelation.schema) { val errorDescription = From 893d6fd7049daf3c4d01eb6a960801cd064d5f73 Mon Sep 17 00:00:00 2001 From: Kay Ousterhout Date: Thu, 12 Feb 2015 14:35:44 -0800 Subject: [PATCH 137/357] [SPARK-5645] Added local read bytes/time to task metrics ksakellis I stumbled on your JIRA for this yesterday; I know it's assigned to you but I'd already done this for my own uses a while ago so thought I could help save you the work of doing it! Hopefully this doesn't duplicate any work you've already done. Here's a screenshot of what the UI looks like: ![image](https://cloud.githubusercontent.com/assets/1108612/6135352/c03e7276-b11c-11e4-8f11-c6aefe1f35b9.png) Based on a discussion with pwendell, I put the data read remotely in as an additional metric rather than showing it in brackets as you'd suggested, Kostas. The assumption here is that the average user doesn't care about the differentiation between local / remote data, so it's better not to pollute the UI. I also added data about the local read time, which I've found very helpful for debugging, but I didn't put it in the UI because I think it's probably something not a ton of people will need to use. With this change, the total read time and total write time shown in the UI will be equal, fixing a long-term source of user confusion: ![image](https://cloud.githubusercontent.com/assets/1108612/6135399/25f14490-b11d-11e4-8086-20be5f4002e6.png) Author: Kay Ousterhout Closes #4510 from kayousterhout/SPARK-5645 and squashes the following commits: 4a0182c [Kay Ousterhout] oops 5f5da1b [Kay Ousterhout] Small style fix 5da04cf [Kay Ousterhout] Addressed more comments from Kostas ba05149 [Kay Ousterhout] Remove parens a9dc685 [Kay Ousterhout] Kostas comment, test fix 33d2e2d [Kay Ousterhout] Merge remote-tracking branch 'upstream/master' into SPARK-5645 347e2cd [Kay Ousterhout] [SPARK-5645] Added local read bytes/time to task metrics --- .../org/apache/spark/ui/static/webui.css | 4 +- .../apache/spark/executor/TaskMetrics.scala | 21 ++++++ .../apache/spark/scheduler/JobLogger.scala | 4 +- .../storage/ShuffleBlockFetcherIterator.scala | 3 + .../scala/org/apache/spark/ui/ToolTips.scala | 8 ++- .../spark/ui/jobs/JobProgressListener.scala | 6 +- .../org/apache/spark/ui/jobs/StagePage.scala | 67 +++++++++++++++---- .../org/apache/spark/ui/jobs/StageTable.scala | 2 +- .../spark/ui/jobs/TaskDetailsClassNames.scala | 1 + .../org/apache/spark/ui/jobs/UIData.scala | 4 +- .../org/apache/spark/util/JsonProtocol.scala | 4 ++ .../ui/jobs/JobProgressListenerSuite.scala | 12 ++-- .../apache/spark/util/JsonProtocolSuite.scala | 17 +++++ 13 files changed, 125 insertions(+), 28 deletions(-) diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css index 68b33b5f0d7c7..6c37cc8b98236 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.css +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css @@ -196,7 +196,7 @@ span.additional-metric-title { /* Hide all additional metrics by default. This is done here rather than using JavaScript to * avoid slow page loads for stage pages with large numbers (e.g., thousands) of tasks. 
*/ -.scheduler_delay, .deserialization_time, .fetch_wait_time, .serialization_time, -.getting_result_time { +.scheduler_delay, .deserialization_time, .fetch_wait_time, .shuffle_read_remote, +.serialization_time, .getting_result_time { display: none; } diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index bf3f1e4fc7832..df36566bec4b1 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -202,6 +202,8 @@ class TaskMetrics extends Serializable { merged.incLocalBlocksFetched(depMetrics.localBlocksFetched) merged.incRemoteBlocksFetched(depMetrics.remoteBlocksFetched) merged.incRemoteBytesRead(depMetrics.remoteBytesRead) + merged.incLocalBytesRead(depMetrics.localBytesRead) + merged.incLocalReadTime(depMetrics.localReadTime) merged.incRecordsRead(depMetrics.recordsRead) } _shuffleReadMetrics = Some(merged) @@ -343,6 +345,25 @@ class ShuffleReadMetrics extends Serializable { private[spark] def incRemoteBytesRead(value: Long) = _remoteBytesRead += value private[spark] def decRemoteBytesRead(value: Long) = _remoteBytesRead -= value + /** + * Time the task spent (in milliseconds) reading local shuffle blocks (from the local disk). + */ + private var _localReadTime: Long = _ + def localReadTime = _localReadTime + private[spark] def incLocalReadTime(value: Long) = _localReadTime += value + + /** + * Shuffle data that was read from the local disk (as opposed to from a remote executor). + */ + private var _localBytesRead: Long = _ + def localBytesRead = _localBytesRead + private[spark] def incLocalBytesRead(value: Long) = _localBytesRead += value + + /** + * Total bytes fetched in the shuffle by this task (both remote and local). + */ + def totalBytesRead = _remoteBytesRead + _localBytesRead + /** * Number of blocks fetched in this shuffle by this task (remote or local) */ diff --git a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala index 3bb54855bae44..f9fc8aa30454e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala @@ -169,7 +169,9 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener " BLOCK_FETCHED_LOCAL=" + metrics.localBlocksFetched + " BLOCK_FETCHED_REMOTE=" + metrics.remoteBlocksFetched + " REMOTE_FETCH_WAIT_TIME=" + metrics.fetchWaitTime + - " REMOTE_BYTES_READ=" + metrics.remoteBytesRead + " REMOTE_BYTES_READ=" + metrics.remoteBytesRead + + " LOCAL_READ_TIME=" + metrics.localReadTime + + " LOCAL_BYTES_READ=" + metrics.localBytesRead case None => "" } val writeMetrics = taskMetrics.shuffleWriteMetrics match { diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index ab9ee4f0096bf..2ebb79989da43 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -228,12 +228,14 @@ final class ShuffleBlockFetcherIterator( * track in-memory are the ManagedBuffer references themselves. 
*/ private[this] def fetchLocalBlocks() { + val startTime = System.currentTimeMillis val iter = localBlocks.iterator while (iter.hasNext) { val blockId = iter.next() try { val buf = blockManager.getBlockData(blockId) shuffleMetrics.incLocalBlocksFetched(1) + shuffleMetrics.incLocalBytesRead(buf.size) buf.retain() results.put(new SuccessFetchResult(blockId, 0, buf)) } catch { @@ -244,6 +246,7 @@ final class ShuffleBlockFetcherIterator( return } } + shuffleMetrics.incLocalReadTime(System.currentTimeMillis - startTime) } private[this] def initialize(): Unit = { diff --git a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala index 3a15e603b1969..cae6870c2ab20 100644 --- a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala +++ b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala @@ -37,8 +37,12 @@ private[spark] object ToolTips { "Bytes and records written to disk in order to be read by a shuffle in a future stage." val SHUFFLE_READ = - """Bytes and records read from remote executors. Typically less than shuffle write bytes - because this does not include shuffle data read locally.""" + """Total shuffle bytes and records read (includes both data read locally and data read from + remote executors). """ + + val SHUFFLE_READ_REMOTE_SIZE = + """Total shuffle bytes read from remote executors. This is a subset of the shuffle + read bytes; the remaining shuffle data is read locally. """ val GETTING_RESULT_TIME = """Time that the driver spends fetching task results from workers. If this is large, consider diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala index f463f8d7c7215..0b6fe70bd2062 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala @@ -401,9 +401,9 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging { execSummary.shuffleWriteRecords += shuffleWriteRecordsDelta val shuffleReadDelta = - (taskMetrics.shuffleReadMetrics.map(_.remoteBytesRead).getOrElse(0L) - - oldMetrics.flatMap(_.shuffleReadMetrics).map(_.remoteBytesRead).getOrElse(0L)) - stageData.shuffleReadBytes += shuffleReadDelta + (taskMetrics.shuffleReadMetrics.map(_.totalBytesRead).getOrElse(0L) + - oldMetrics.flatMap(_.shuffleReadMetrics).map(_.totalBytesRead).getOrElse(0L)) + stageData.shuffleReadTotalBytes += shuffleReadDelta execSummary.shuffleRead += shuffleReadDelta val shuffleReadRecordsDelta = diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 05ffd5bc58fbb..d752434ad58ae 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -85,7 +85,7 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { {if (stageData.hasShuffleRead) {
  • Shuffle read: - {s"${Utils.bytesToString(stageData.shuffleReadBytes)} / " + + {s"${Utils.bytesToString(stageData.shuffleReadTotalBytes)} / " + s"${stageData.shuffleReadRecords}"}
  • }} @@ -143,6 +143,13 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { Shuffle Read Blocked Time +
  • + + + Shuffle Remote Reads + +
  • }}
  • metrics.get.shuffleReadMetrics.map(_.fetchWaitTime).getOrElse(0L).toDouble } - val shuffleReadBlockedQuantiles = Shuffle Read Blocked Time +: + val shuffleReadBlockedQuantiles = + + + Shuffle Read Blocked Time + + +: getFormattedTimeQuantiles(shuffleReadBlockedTimes) - val shuffleReadSizes = validTasks.map { case TaskUIData(_, metrics, _) => - metrics.get.shuffleReadMetrics.map(_.remoteBytesRead).getOrElse(0L).toDouble + val shuffleReadTotalSizes = validTasks.map { case TaskUIData(_, metrics, _) => + metrics.get.shuffleReadMetrics.map(_.totalBytesRead).getOrElse(0L).toDouble } - - val shuffleReadRecords = validTasks.map { case TaskUIData(_, metrics, _) => + val shuffleReadTotalRecords = validTasks.map { case TaskUIData(_, metrics, _) => metrics.get.shuffleReadMetrics.map(_.recordsRead).getOrElse(0L).toDouble } + val shuffleReadTotalQuantiles = + + + Shuffle Read Size / Records + + +: + getFormattedSizeQuantilesWithRecords(shuffleReadTotalSizes, shuffleReadTotalRecords) - val shuffleReadQuantiles = Shuffle Read Size / Records (Remote) +: - getFormattedSizeQuantilesWithRecords(shuffleReadSizes, shuffleReadRecords) + val shuffleReadRemoteSizes = validTasks.map { case TaskUIData(_, metrics, _) => + metrics.get.shuffleReadMetrics.map(_.remoteBytesRead).getOrElse(0L).toDouble + } + val shuffleReadRemoteQuantiles = + + + Shuffle Remote Reads + + +: + getFormattedSizeQuantiles(shuffleReadRemoteSizes) val shuffleWriteSizes = validTasks.map { case TaskUIData(_, metrics, _) => metrics.get.shuffleWriteMetrics.map(_.shuffleBytesWritten).getOrElse(0L).toDouble @@ -374,7 +404,10 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { {shuffleReadBlockedQuantiles} - {shuffleReadQuantiles} + {shuffleReadTotalQuantiles} + + {shuffleReadRemoteQuantiles} + } else { Nil }, @@ -454,11 +487,15 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { val shuffleReadBlockedTimeReadable = maybeShuffleRead.map(ms => UIUtils.formatDuration(ms.fetchWaitTime)).getOrElse("") - val shuffleReadSortable = maybeShuffleRead.map(_.remoteBytesRead.toString).getOrElse("") - val shuffleReadReadable = maybeShuffleRead - .map(m => s"${Utils.bytesToString(m.remoteBytesRead)}").getOrElse("") + val totalShuffleBytes = maybeShuffleRead.map(_.totalBytesRead) + val shuffleReadSortable = totalShuffleBytes.map(_.toString).getOrElse("") + val shuffleReadReadable = totalShuffleBytes.map(Utils.bytesToString).getOrElse("") val shuffleReadRecords = maybeShuffleRead.map(_.recordsRead.toString).getOrElse("") + val remoteShuffleBytes = maybeShuffleRead.map(_.remoteBytesRead) + val shuffleReadRemoteSortable = remoteShuffleBytes.map(_.toString).getOrElse("") + val shuffleReadRemoteReadable = remoteShuffleBytes.map(Utils.bytesToString).getOrElse("") + val maybeShuffleWrite = metrics.flatMap(_.shuffleWriteMetrics) val shuffleWriteSortable = maybeShuffleWrite.map(_.shuffleBytesWritten.toString).getOrElse("") val shuffleWriteReadable = maybeShuffleWrite @@ -536,6 +573,10 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { {s"$shuffleReadReadable / $shuffleReadRecords"} + + {shuffleReadRemoteReadable} + }} {if (hasShuffleWrite) { diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala index 703d43f9c640d..5865850fa09b5 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala @@ -138,7 +138,7 @@ private[ui] class 
StageTableBase( val inputReadWithUnit = if (inputRead > 0) Utils.bytesToString(inputRead) else "" val outputWrite = stageData.outputBytes val outputWriteWithUnit = if (outputWrite > 0) Utils.bytesToString(outputWrite) else "" - val shuffleRead = stageData.shuffleReadBytes + val shuffleRead = stageData.shuffleReadTotalBytes val shuffleReadWithUnit = if (shuffleRead > 0) Utils.bytesToString(shuffleRead) else "" val shuffleWrite = stageData.shuffleWriteBytes val shuffleWriteWithUnit = if (shuffleWrite > 0) Utils.bytesToString(shuffleWrite) else "" diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/TaskDetailsClassNames.scala b/core/src/main/scala/org/apache/spark/ui/jobs/TaskDetailsClassNames.scala index 37cf2c207ba40..9bf67db8acde1 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/TaskDetailsClassNames.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/TaskDetailsClassNames.scala @@ -28,6 +28,7 @@ private[spark] object TaskDetailsClassNames { val SCHEDULER_DELAY = "scheduler_delay" val TASK_DESERIALIZATION_TIME = "deserialization_time" val SHUFFLE_READ_BLOCKED_TIME = "fetch_wait_time" + val SHUFFLE_READ_REMOTE_SIZE = "shuffle_read_remote" val RESULT_SERIALIZATION_TIME = "serialization_time" val GETTING_RESULT_TIME = "getting_result_time" } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala index 69aac6c862de5..dbf1ceeda1878 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala @@ -80,7 +80,7 @@ private[jobs] object UIData { var inputRecords: Long = _ var outputBytes: Long = _ var outputRecords: Long = _ - var shuffleReadBytes: Long = _ + var shuffleReadTotalBytes: Long = _ var shuffleReadRecords : Long = _ var shuffleWriteBytes: Long = _ var shuffleWriteRecords: Long = _ @@ -96,7 +96,7 @@ private[jobs] object UIData { def hasInput = inputBytes > 0 def hasOutput = outputBytes > 0 - def hasShuffleRead = shuffleReadBytes > 0 + def hasShuffleRead = shuffleReadTotalBytes > 0 def hasShuffleWrite = shuffleWriteBytes > 0 def hasBytesSpilled = memoryBytesSpilled > 0 && diskBytesSpilled > 0 } diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index b0b545640f5aa..58d37e2d667f7 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -294,6 +294,8 @@ private[spark] object JsonProtocol { ("Local Blocks Fetched" -> shuffleReadMetrics.localBlocksFetched) ~ ("Fetch Wait Time" -> shuffleReadMetrics.fetchWaitTime) ~ ("Remote Bytes Read" -> shuffleReadMetrics.remoteBytesRead) ~ + ("Local Read Time" -> shuffleReadMetrics.localReadTime) ~ + ("Local Bytes Read" -> shuffleReadMetrics.localBytesRead) ~ ("Total Records Read" -> shuffleReadMetrics.recordsRead) } @@ -674,6 +676,8 @@ private[spark] object JsonProtocol { metrics.incLocalBlocksFetched((json \ "Local Blocks Fetched").extract[Int]) metrics.incFetchWaitTime((json \ "Fetch Wait Time").extract[Long]) metrics.incRemoteBytesRead((json \ "Remote Bytes Read").extract[Long]) + metrics.incLocalReadTime((json \ "Local Read Time").extractOpt[Long].getOrElse(0)) + metrics.incLocalBytesRead((json \ "Local Bytes Read").extractOpt[Long].getOrElse(0)) metrics.incRecordsRead((json \ "Total Records Read").extractOpt[Long].getOrElse(0)) metrics } diff --git 
a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala index e8405baa8e3ea..6019282d2fb70 100644 --- a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala @@ -227,6 +227,7 @@ class JobProgressListenerSuite extends FunSuite with LocalSparkContext with Matc taskMetrics.setShuffleReadMetrics(Some(shuffleReadMetrics)) taskMetrics.shuffleWriteMetrics = Some(shuffleWriteMetrics) shuffleReadMetrics.incRemoteBytesRead(base + 1) + shuffleReadMetrics.incLocalBytesRead(base + 9) shuffleReadMetrics.incRemoteBlocksFetched(base + 2) shuffleWriteMetrics.incShuffleBytesWritten(base + 3) taskMetrics.setExecutorRunTime(base + 4) @@ -260,8 +261,8 @@ class JobProgressListenerSuite extends FunSuite with LocalSparkContext with Matc var stage0Data = listener.stageIdToData.get((0, 0)).get var stage1Data = listener.stageIdToData.get((1, 0)).get - assert(stage0Data.shuffleReadBytes == 102) - assert(stage1Data.shuffleReadBytes == 201) + assert(stage0Data.shuffleReadTotalBytes == 220) + assert(stage1Data.shuffleReadTotalBytes == 410) assert(stage0Data.shuffleWriteBytes == 106) assert(stage1Data.shuffleWriteBytes == 203) assert(stage0Data.executorRunTime == 108) @@ -290,8 +291,11 @@ class JobProgressListenerSuite extends FunSuite with LocalSparkContext with Matc stage0Data = listener.stageIdToData.get((0, 0)).get stage1Data = listener.stageIdToData.get((1, 0)).get - assert(stage0Data.shuffleReadBytes == 402) - assert(stage1Data.shuffleReadBytes == 602) + // Task 1235 contributed (100+1)+(100+9) = 210 shuffle bytes, and task 1234 contributed + // (300+1)+(300+9) = 610 total shuffle bytes, so the total for the stage is 820. + assert(stage0Data.shuffleReadTotalBytes == 820) + // Task 1236 contributed 410 shuffle bytes, and task 1237 contributed 810 shuffle bytes. + assert(stage1Data.shuffleReadTotalBytes == 1220) assert(stage0Data.shuffleWriteBytes == 406) assert(stage1Data.shuffleWriteBytes == 606) assert(stage0Data.executorRunTime == 408) diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index f3017dc42cd5c..c181baf6844b0 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -260,6 +260,19 @@ class JsonProtocolSuite extends FunSuite { assert(expectedFetchFailed === JsonProtocol.taskEndReasonFromJson(oldEvent)) } + test("ShuffleReadMetrics: Local bytes read and time taken backwards compatibility") { + // Metrics about local shuffle bytes read and local read time were added in 1.3.1. + val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, + hasHadoopInput = false, hasOutput = false, hasRecords = false) + assert(metrics.shuffleReadMetrics.nonEmpty) + val newJson = JsonProtocol.taskMetricsToJson(metrics) + val oldJson = newJson.removeField { case (field, _) => field == "Local Bytes Read" } + .removeField { case (field, _) => field == "Local Read Time" } + val newMetrics = JsonProtocol.taskMetricsFromJson(oldJson) + assert(newMetrics.shuffleReadMetrics.get.localBytesRead == 0) + assert(newMetrics.shuffleReadMetrics.get.localReadTime == 0) + } + test("SparkListenerApplicationStart backwards compatibility") { // SparkListenerApplicationStart in Spark 1.0.0 do not have an "appId" property. 
val applicationStart = SparkListenerApplicationStart("test", None, 1L, "user") @@ -695,6 +708,8 @@ class JsonProtocolSuite extends FunSuite { sr.incFetchWaitTime(a + d) sr.incRemoteBlocksFetched(f) sr.incRecordsRead(if (hasRecords) (b + d) / 100 else -1) + sr.incLocalReadTime(a + e) + sr.incLocalBytesRead(a + f) t.setShuffleReadMetrics(Some(sr)) } if (hasOutput) { @@ -941,6 +956,8 @@ class JsonProtocolSuite extends FunSuite { | "Local Blocks Fetched": 700, | "Fetch Wait Time": 900, | "Remote Bytes Read": 1000, + | "Local Read Time": 1000, + | "Local Bytes Read": 1100, | "Total Records Read" : 10 | }, | "Shuffle Write Metrics": { From 9c8076502f0c2c6a6dcdc6720d16b34132dfc06a Mon Sep 17 00:00:00 2001 From: Katsunori Kanda Date: Thu, 12 Feb 2015 14:38:42 -0800 Subject: [PATCH 138/357] [EC2] Update default Spark version to 1.2.1 Author: Katsunori Kanda Closes #4566 from potix2/ec2-update-version-1-2-1 and squashes the following commits: 77e7840 [Katsunori Kanda] [EC2] Update default Spark version to 1.2.1 --- ec2/spark_ec2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 0ea7365d75b83..35209aec5fcfe 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -42,7 +42,7 @@ from optparse import OptionParser from sys import stderr -SPARK_EC2_VERSION = "1.2.0" +SPARK_EC2_VERSION = "1.2.1" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -58,6 +58,7 @@ "1.1.0", "1.1.1", "1.2.0", + "1.2.1", ]) DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION From 629d0143eeb3c153dac9c65e7b556723c6b4bfc7 Mon Sep 17 00:00:00 2001 From: Venkata Ramana Gollamudi Date: Thu, 12 Feb 2015 14:44:21 -0800 Subject: [PATCH 139/357] [SPARK-5765][Examples]Fixed word split problem in run-example and compute-classpath Author: Venkata Ramana G Author: Venkata Ramana Gollamudi Closes #4561 from gvramana/word_split and squashes the following commits: 285c8d4 [Venkata Ramana Gollamudi] Fixed word split problem in run-example and compute-classpath --- bin/compute-classpath.sh | 4 ++-- bin/run-example | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index a8c344b1ca594..f4f6b7b909490 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -76,7 +76,7 @@ fi num_jars=0 -for f in ${assembly_folder}/spark-assembly*hadoop*.jar; do +for f in "${assembly_folder}"/spark-assembly*hadoop*.jar; do if [[ ! -e "$f" ]]; then echo "Failed to find Spark assembly in $assembly_folder" 1>&2 echo "You need to build Spark before running this program." 1>&2 @@ -88,7 +88,7 @@ done if [ "$num_jars" -gt "1" ]; then echo "Found multiple Spark assembly jars in $assembly_folder:" 1>&2 - ls ${assembly_folder}/spark-assembly*hadoop*.jar 1>&2 + ls "${assembly_folder}"/spark-assembly*hadoop*.jar 1>&2 echo "Please remove all but one jar." 1>&2 exit 1 fi diff --git a/bin/run-example b/bin/run-example index c567acf9a6b5c..a106411392e06 100755 --- a/bin/run-example +++ b/bin/run-example @@ -42,7 +42,7 @@ fi JAR_COUNT=0 -for f in ${JAR_PATH}/spark-examples-*hadoop*.jar; do +for f in "${JAR_PATH}"/spark-examples-*hadoop*.jar; do if [[ ! 
-e "$f" ]]; then echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" 1>&2 echo "You need to build Spark before running this program" 1>&2 @@ -54,7 +54,7 @@ done if [ "$JAR_COUNT" -gt "1" ]; then echo "Found multiple Spark examples assembly jars in ${JAR_PATH}" 1>&2 - ls ${JAR_PATH}/spark-examples-*hadoop*.jar 1>&2 + ls "${JAR_PATH}"/spark-examples-*hadoop*.jar 1>&2 echo "Please remove all but one jar." 1>&2 exit 1 fi From 47c73d410ab533c3196184d2b6004081e79daeaa Mon Sep 17 00:00:00 2001 From: Kay Ousterhout Date: Thu, 12 Feb 2015 14:46:37 -0800 Subject: [PATCH 140/357] [SPARK-5762] Fix shuffle write time for sort-based shuffle mateiz was excluding the time to write this final file from the shuffle write time intentional? Author: Kay Ousterhout Closes #4559 from kayousterhout/SPARK-5762 and squashes the following commits: 5c6f3d9 [Kay Ousterhout] Use foreach 94e4237 [Kay Ousterhout] Removed open time metrics added inadvertently ace156c [Kay Ousterhout] Moved metrics to finally block d773276 [Kay Ousterhout] Use nano time 5a59906 [Kay Ousterhout] [SPARK-5762] Fix shuffle write time for sort-based shuffle --- .../org/apache/spark/util/collection/ExternalSorter.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index eaec5a71e6819..d69f2d9048055 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -723,6 +723,7 @@ private[spark] class ExternalSorter[K, V, C]( partitionWriters.foreach(_.commitAndClose()) var out: FileOutputStream = null var in: FileInputStream = null + val writeStartTime = System.nanoTime try { out = new FileOutputStream(outputFile, true) for (i <- 0 until numPartitions) { @@ -739,6 +740,8 @@ private[spark] class ExternalSorter[K, V, C]( if (in != null) { in.close() } + context.taskMetrics.shuffleWriteMetrics.foreach( + _.incShuffleWriteTime(System.nanoTime - writeStartTime)) } } else { // Either we're not bypassing merge-sort or we have only in-memory data; get an iterator by From 1d5663e92cdaaa3dabfa58fdd7aede7e4fa4ec63 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Thu, 12 Feb 2015 14:47:52 -0800 Subject: [PATCH 141/357] [SPARK-5760][SPARK-5761] Fix standalone rest protocol corner cases + revamp tests The changes are summarized in the commit message. Test or test-related code accounts for 90% of the lines changed. 
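For context, the flow these changes and the revamped tests exercise can be sketched as follows. This is an illustrative sketch only, not code from the patch: the object name, master URL, jar path, main class, and submission ID are placeholders, and it assumes compilation inside the org.apache.spark.deploy.rest package because the client classes are private[spark].

    package org.apache.spark.deploy.rest

    // Hypothetical driver program exercising the standalone REST submission client.
    object RestSubmitSketch {
      def main(args: Array[String]): Unit = {
        val client = new StandaloneRestClient
        val masterUrl = "spark://master-host:6066"    // placeholder REST endpoint of the master
        // Build a submit request from explicit parameters ...
        val request = client.constructSubmitRequest(
          "/path/to/app.jar",                         // placeholder appResource
          "org.example.Main",                         // placeholder main class
          Array("one", "two"),                        // application arguments
          Map("spark.master" -> masterUrl, "spark.app.name" -> "example"),  // Spark properties
          Map("SPARK_LOG_DIR" -> "/tmp"))             // environment variables for the driver
        // ... and POST it as JSON to http://<master>/v1/submissions/create.
        val createResponse = client.createSubmission(masterUrl, request)
        // Kill and status requests carry the submission ID in the URL path,
        // e.g. /v1/submissions/status/<id>; the ID used here is a placeholder.
        val statusResponse = client.requestSubmissionStatus(masterUrl, "driver-id-placeholder")
        val killResponse = client.killSubmission(masterUrl, "driver-id-placeholder")
      }
    }

The corner-case handling below (the parseSubmissionId helper and the reworked ErrorServlet path matching) is what keeps malformed variants of these URLs from crashing the server instead of returning a proper error response.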
Author: Andrew Or Closes #4557 from andrewor14/rest-tests and squashes the following commits: b4dc980 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest-tests b55e40f [Andrew Or] Add test for unknown fields cc96993 [Andrew Or] private[spark] -> private[rest] 578cf45 [Andrew Or] Clean up test code a little d82d971 [Andrew Or] v1 -> serverVersion ea48f65 [Andrew Or] Merge branch 'master' of github.com:apache/spark into rest-tests 00999a8 [Andrew Or] Revamp tests + fix a few corner cases --- .../deploy/rest/StandaloneRestClient.scala | 52 +- .../deploy/rest/StandaloneRestServer.scala | 105 ++- .../rest/StandaloneRestSubmitSuite.scala | 671 +++++++++++++----- 3 files changed, 589 insertions(+), 239 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestClient.scala b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestClient.scala index 115aa5278bb62..c4be1f19e8e9f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestClient.scala @@ -19,10 +19,11 @@ package org.apache.spark.deploy.rest import java.io.{DataOutputStream, FileNotFoundException} import java.net.{HttpURLConnection, SocketException, URL} +import javax.servlet.http.HttpServletResponse import scala.io.Source -import com.fasterxml.jackson.databind.JsonMappingException +import com.fasterxml.jackson.core.JsonProcessingException import com.google.common.base.Charsets import org.apache.spark.{Logging, SparkConf, SPARK_VERSION => sparkVersion} @@ -155,10 +156,21 @@ private[spark] class StandaloneRestClient extends Logging { /** * Read the response from the server and return it as a validated [[SubmitRestProtocolResponse]]. * If the response represents an error, report the embedded message to the user. + * Exposed for testing. */ - private def readResponse(connection: HttpURLConnection): SubmitRestProtocolResponse = { + private[rest] def readResponse(connection: HttpURLConnection): SubmitRestProtocolResponse = { try { - val responseJson = Source.fromInputStream(connection.getInputStream).mkString + val dataStream = + if (connection.getResponseCode == HttpServletResponse.SC_OK) { + connection.getInputStream + } else { + connection.getErrorStream + } + // If the server threw an exception while writing a response, it will not have a body + if (dataStream == null) { + throw new SubmitRestProtocolException("Server returned empty body") + } + val responseJson = Source.fromInputStream(dataStream).mkString logDebug(s"Response from the server:\n$responseJson") val response = SubmitRestProtocolMessage.fromJson(responseJson) response.validate() @@ -177,7 +189,7 @@ private[spark] class StandaloneRestClient extends Logging { case unreachable @ (_: FileNotFoundException | _: SocketException) => throw new SubmitRestConnectionException( s"Unable to connect to server ${connection.getURL}", unreachable) - case malformed @ (_: SubmitRestProtocolException | _: JsonMappingException) => + case malformed @ (_: JsonProcessingException | _: SubmitRestProtocolException) => throw new SubmitRestProtocolException( "Malformed response received from server", malformed) } @@ -284,7 +296,27 @@ private[spark] object StandaloneRestClient { val REPORT_DRIVER_STATUS_MAX_TRIES = 10 val PROTOCOL_VERSION = "v1" - /** Submit an application, assuming Spark parameters are specified through system properties. */ + /** + * Submit an application, assuming Spark parameters are specified through the given config. 
+ * This is abstracted to its own method for testing purposes. + */ + private[rest] def run( + appResource: String, + mainClass: String, + appArgs: Array[String], + conf: SparkConf, + env: Map[String, String] = sys.env): SubmitRestProtocolResponse = { + val master = conf.getOption("spark.master").getOrElse { + throw new IllegalArgumentException("'spark.master' must be set.") + } + val sparkProperties = conf.getAll.toMap + val environmentVariables = env.filter { case (k, _) => k.startsWith("SPARK_") } + val client = new StandaloneRestClient + val submitRequest = client.constructSubmitRequest( + appResource, mainClass, appArgs, sparkProperties, environmentVariables) + client.createSubmission(master, submitRequest) + } + def main(args: Array[String]): Unit = { if (args.size < 2) { sys.error("Usage: StandaloneRestClient [app resource] [main class] [app args*]") @@ -294,14 +326,6 @@ private[spark] object StandaloneRestClient { val mainClass = args(1) val appArgs = args.slice(2, args.size) val conf = new SparkConf - val master = conf.getOption("spark.master").getOrElse { - throw new IllegalArgumentException("'spark.master' must be set.") - } - val sparkProperties = conf.getAll.toMap - val environmentVariables = sys.env.filter { case (k, _) => k.startsWith("SPARK_") } - val client = new StandaloneRestClient - val submitRequest = client.constructSubmitRequest( - appResource, mainClass, appArgs, sparkProperties, environmentVariables) - client.createSubmission(master, submitRequest) + run(appResource, mainClass, appArgs, conf) } } diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala index acd3a2b5abe6c..f9e0478e4f874 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/StandaloneRestServer.scala @@ -17,15 +17,14 @@ package org.apache.spark.deploy.rest -import java.io.{DataOutputStream, File} +import java.io.File import java.net.InetSocketAddress import javax.servlet.http.{HttpServlet, HttpServletRequest, HttpServletResponse} import scala.io.Source import akka.actor.ActorRef -import com.fasterxml.jackson.databind.JsonMappingException -import com.google.common.base.Charsets +import com.fasterxml.jackson.core.JsonProcessingException import org.eclipse.jetty.server.Server import org.eclipse.jetty.servlet.{ServletHolder, ServletContextHandler} import org.eclipse.jetty.util.thread.QueuedThreadPool @@ -70,14 +69,14 @@ private[spark] class StandaloneRestServer( import StandaloneRestServer._ private var _server: Option[Server] = None - private val baseContext = s"/$PROTOCOL_VERSION/submissions" - - // A mapping from servlets to the URL prefixes they are responsible for - private val servletToContext = Map[StandaloneRestServlet, String]( - new SubmitRequestServlet(masterActor, masterUrl, masterConf) -> s"$baseContext/create/*", - new KillRequestServlet(masterActor, masterConf) -> s"$baseContext/kill/*", - new StatusRequestServlet(masterActor, masterConf) -> s"$baseContext/status/*", - new ErrorServlet -> "/*" // default handler + + // A mapping from URL prefixes to servlets that serve them. Exposed for testing. 
+ protected val baseContext = s"/$PROTOCOL_VERSION/submissions" + protected val contextToServlet = Map[String, StandaloneRestServlet]( + s"$baseContext/create/*" -> new SubmitRequestServlet(masterActor, masterUrl, masterConf), + s"$baseContext/kill/*" -> new KillRequestServlet(masterActor, masterConf), + s"$baseContext/status/*" -> new StatusRequestServlet(masterActor, masterConf), + "/*" -> new ErrorServlet // default handler ) /** Start the server and return the bound port. */ @@ -99,7 +98,7 @@ private[spark] class StandaloneRestServer( server.setThreadPool(threadPool) val mainHandler = new ServletContextHandler mainHandler.setContextPath("/") - servletToContext.foreach { case (servlet, prefix) => + contextToServlet.foreach { case (prefix, servlet) => mainHandler.addServlet(new ServletHolder(servlet), prefix) } server.setHandler(mainHandler) @@ -113,7 +112,7 @@ private[spark] class StandaloneRestServer( } } -private object StandaloneRestServer { +private[rest] object StandaloneRestServer { val PROTOCOL_VERSION = StandaloneRestClient.PROTOCOL_VERSION val SC_UNKNOWN_PROTOCOL_VERSION = 468 } @@ -121,20 +120,7 @@ private object StandaloneRestServer { /** * An abstract servlet for handling requests passed to the [[StandaloneRestServer]]. */ -private abstract class StandaloneRestServlet extends HttpServlet with Logging { - - /** Service a request. If an exception is thrown in the process, indicate server error. */ - protected override def service( - request: HttpServletRequest, - response: HttpServletResponse): Unit = { - try { - super.service(request, response) - } catch { - case e: Exception => - logError("Exception while handling request", e) - response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR) - } - } +private[rest] abstract class StandaloneRestServlet extends HttpServlet with Logging { /** * Serialize the given response message to JSON and send it through the response servlet. @@ -146,11 +132,7 @@ private abstract class StandaloneRestServlet extends HttpServlet with Logging { val message = validateResponse(responseMessage, responseServlet) responseServlet.setContentType("application/json") responseServlet.setCharacterEncoding("utf-8") - responseServlet.setStatus(HttpServletResponse.SC_OK) - val content = message.toJson.getBytes(Charsets.UTF_8) - val out = new DataOutputStream(responseServlet.getOutputStream) - out.write(content) - out.close() + responseServlet.getWriter.write(message.toJson) } /** @@ -186,6 +168,19 @@ private abstract class StandaloneRestServlet extends HttpServlet with Logging { e } + /** + * Parse a submission ID from the relative path, assuming it is the first part of the path. + * For instance, we expect the path to take the form /[submission ID]/maybe/something/else. + * The returned submission ID cannot be empty. If the path is unexpected, return None. + */ + protected def parseSubmissionId(path: String): Option[String] = { + if (path == null || path.isEmpty) { + None + } else { + path.stripPrefix("/").split("/").headOption.filter(_.nonEmpty) + } + } + /** * Validate the response to ensure that it is correctly constructed. * @@ -209,7 +204,7 @@ private abstract class StandaloneRestServlet extends HttpServlet with Logging { /** * A servlet for handling kill requests passed to the [[StandaloneRestServer]]. 
*/ -private class KillRequestServlet(masterActor: ActorRef, conf: SparkConf) +private[rest] class KillRequestServlet(masterActor: ActorRef, conf: SparkConf) extends StandaloneRestServlet { /** @@ -219,18 +214,15 @@ private class KillRequestServlet(masterActor: ActorRef, conf: SparkConf) protected override def doPost( request: HttpServletRequest, response: HttpServletResponse): Unit = { - val submissionId = request.getPathInfo.stripPrefix("/") - val responseMessage = - if (submissionId.nonEmpty) { - handleKill(submissionId) - } else { - response.setStatus(HttpServletResponse.SC_BAD_REQUEST) - handleError("Submission ID is missing in kill request.") - } + val submissionId = parseSubmissionId(request.getPathInfo) + val responseMessage = submissionId.map(handleKill).getOrElse { + response.setStatus(HttpServletResponse.SC_BAD_REQUEST) + handleError("Submission ID is missing in kill request.") + } sendResponse(responseMessage, response) } - private def handleKill(submissionId: String): KillSubmissionResponse = { + protected def handleKill(submissionId: String): KillSubmissionResponse = { val askTimeout = AkkaUtils.askTimeout(conf) val response = AkkaUtils.askWithReply[DeployMessages.KillDriverResponse]( DeployMessages.RequestKillDriver(submissionId), masterActor, askTimeout) @@ -246,7 +238,7 @@ private class KillRequestServlet(masterActor: ActorRef, conf: SparkConf) /** * A servlet for handling status requests passed to the [[StandaloneRestServer]]. */ -private class StatusRequestServlet(masterActor: ActorRef, conf: SparkConf) +private[rest] class StatusRequestServlet(masterActor: ActorRef, conf: SparkConf) extends StandaloneRestServlet { /** @@ -256,18 +248,15 @@ private class StatusRequestServlet(masterActor: ActorRef, conf: SparkConf) protected override def doGet( request: HttpServletRequest, response: HttpServletResponse): Unit = { - val submissionId = request.getPathInfo.stripPrefix("/") - val responseMessage = - if (submissionId.nonEmpty) { - handleStatus(submissionId) - } else { - response.setStatus(HttpServletResponse.SC_BAD_REQUEST) - handleError("Submission ID is missing in status request.") - } + val submissionId = parseSubmissionId(request.getPathInfo) + val responseMessage = submissionId.map(handleStatus).getOrElse { + response.setStatus(HttpServletResponse.SC_BAD_REQUEST) + handleError("Submission ID is missing in status request.") + } sendResponse(responseMessage, response) } - private def handleStatus(submissionId: String): SubmissionStatusResponse = { + protected def handleStatus(submissionId: String): SubmissionStatusResponse = { val askTimeout = AkkaUtils.askTimeout(conf) val response = AkkaUtils.askWithReply[DeployMessages.DriverStatusResponse]( DeployMessages.RequestDriverStatus(submissionId), masterActor, askTimeout) @@ -287,7 +276,7 @@ private class StatusRequestServlet(masterActor: ActorRef, conf: SparkConf) /** * A servlet for handling submit requests passed to the [[StandaloneRestServer]]. 
*/ -private class SubmitRequestServlet( +private[rest] class SubmitRequestServlet( masterActor: ActorRef, masterUrl: String, conf: SparkConf) @@ -313,7 +302,7 @@ private class SubmitRequestServlet( handleSubmit(requestMessageJson, requestMessage, responseServlet) } catch { // The client failed to provide a valid JSON, so this is not our fault - case e @ (_: JsonMappingException | _: SubmitRestProtocolException) => + case e @ (_: JsonProcessingException | _: SubmitRestProtocolException) => responseServlet.setStatus(HttpServletResponse.SC_BAD_REQUEST) handleError("Malformed request: " + formatException(e)) } @@ -413,7 +402,7 @@ private class ErrorServlet extends StandaloneRestServlet { request: HttpServletRequest, response: HttpServletResponse): Unit = { val path = request.getPathInfo - val parts = path.stripPrefix("/").split("/").toSeq + val parts = path.stripPrefix("/").split("/").filter(_.nonEmpty).toList var versionMismatch = false var msg = parts match { @@ -423,10 +412,10 @@ private class ErrorServlet extends StandaloneRestServlet { case `serverVersion` :: Nil => // http://host:port/correct-version "Missing the /submissions prefix." - case `serverVersion` :: "submissions" :: Nil => - // http://host:port/correct-version/submissions + case `serverVersion` :: "submissions" :: tail => + // http://host:port/correct-version/submissions/* "Missing an action: please specify one of /create, /kill, or /status." - case unknownVersion :: _ => + case unknownVersion :: tail => // http://host:port/unknown-version/* versionMismatch = true s"Unknown protocol version '$unknownVersion'." diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala index 29aed89b67aa7..a345e06ecb7d2 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala @@ -17,141 +17,412 @@ package org.apache.spark.deploy.rest -import java.io.{File, FileInputStream, FileOutputStream, PrintWriter} -import java.util.jar.{JarEntry, JarOutputStream} -import java.util.zip.ZipEntry +import java.io.DataOutputStream +import java.net.{HttpURLConnection, URL} +import javax.servlet.http.HttpServletResponse -import scala.collection.mutable.ArrayBuffer -import scala.io.Source +import scala.collection.mutable -import akka.actor.ActorSystem -import com.google.common.io.ByteStreams -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} -import org.scalatest.exceptions.TestFailedException +import akka.actor.{Actor, ActorRef, ActorSystem, Props} +import com.google.common.base.Charsets +import org.scalatest.{BeforeAndAfterEach, FunSuite} +import org.json4s.JsonAST._ +import org.json4s.jackson.JsonMethods._ import org.apache.spark._ -import org.apache.spark.util.Utils +import org.apache.spark.util.{AkkaUtils, Utils} +import org.apache.spark.deploy.DeployMessages._ import org.apache.spark.deploy.{SparkSubmit, SparkSubmitArguments} -import org.apache.spark.deploy.master.{DriverState, Master} -import org.apache.spark.deploy.worker.Worker +import org.apache.spark.deploy.master.DriverState._ /** - * End-to-end tests for the REST application submission protocol in standalone mode. + * Tests for the REST application submission protocol used in standalone cluster mode. 
*/ -class StandaloneRestSubmitSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAfterEach { - private val systemsToStop = new ArrayBuffer[ActorSystem] - private val masterRestUrl = startLocalCluster() +class StandaloneRestSubmitSuite extends FunSuite with BeforeAndAfterEach { private val client = new StandaloneRestClient - private val mainJar = StandaloneRestSubmitSuite.createJar() - private val mainClass = StandaloneRestApp.getClass.getName.stripSuffix("$") + private var actorSystem: Option[ActorSystem] = None + private var server: Option[StandaloneRestServer] = None - override def afterAll() { - systemsToStop.foreach(_.shutdown()) + override def afterEach() { + actorSystem.foreach(_.shutdown()) + server.foreach(_.stop()) } - test("simple submit until completion") { - val resultsFile = File.createTempFile("test-submit", ".txt") - val numbers = Seq(1, 2, 3) - val size = 500 - val submissionId = submitApplication(resultsFile, numbers, size) - waitUntilFinished(submissionId) - validateResult(resultsFile, numbers, size) + test("construct submit request") { + val appArgs = Array("one", "two", "three") + val sparkProperties = Map("spark.app.name" -> "pi") + val environmentVariables = Map("SPARK_ONE" -> "UN", "SPARK_TWO" -> "DEUX") + val request = client.constructSubmitRequest( + "my-app-resource", "my-main-class", appArgs, sparkProperties, environmentVariables) + assert(request.action === Utils.getFormattedClassName(request)) + assert(request.clientSparkVersion === SPARK_VERSION) + assert(request.appResource === "my-app-resource") + assert(request.mainClass === "my-main-class") + assert(request.appArgs === appArgs) + assert(request.sparkProperties === sparkProperties) + assert(request.environmentVariables === environmentVariables) } - test("kill empty submission") { - val response = client.killSubmission(masterRestUrl, "submission-that-does-not-exist") - val killResponse = getKillResponse(response) - val killSuccess = killResponse.success - assert(!killSuccess) + test("create submission") { + val submittedDriverId = "my-driver-id" + val submitMessage = "your driver is submitted" + val masterUrl = startDummyServer(submitId = submittedDriverId, submitMessage = submitMessage) + val appArgs = Array("one", "two", "four") + val request = constructSubmitRequest(masterUrl, appArgs) + assert(request.appArgs === appArgs) + assert(request.sparkProperties("spark.master") === masterUrl) + val response = client.createSubmission(masterUrl, request) + val submitResponse = getSubmitResponse(response) + assert(submitResponse.action === Utils.getFormattedClassName(submitResponse)) + assert(submitResponse.serverSparkVersion === SPARK_VERSION) + assert(submitResponse.message === submitMessage) + assert(submitResponse.submissionId === submittedDriverId) + assert(submitResponse.success) + } + + test("create submission from main method") { + val submittedDriverId = "your-driver-id" + val submitMessage = "my driver is submitted" + val masterUrl = startDummyServer(submitId = submittedDriverId, submitMessage = submitMessage) + val conf = new SparkConf(loadDefaults = false) + conf.set("spark.master", masterUrl) + conf.set("spark.app.name", "dreamer") + val appArgs = Array("one", "two", "six") + // main method calls this + val response = StandaloneRestClient.run("app-resource", "main-class", appArgs, conf) + val submitResponse = getSubmitResponse(response) + assert(submitResponse.action === Utils.getFormattedClassName(submitResponse)) + assert(submitResponse.serverSparkVersion === SPARK_VERSION) + 
assert(submitResponse.message === submitMessage) + assert(submitResponse.submissionId === submittedDriverId) + assert(submitResponse.success) } - test("kill running submission") { - val resultsFile = File.createTempFile("test-kill", ".txt") - val numbers = Seq(1, 2, 3) - val size = 500 - val submissionId = submitApplication(resultsFile, numbers, size) - val response = client.killSubmission(masterRestUrl, submissionId) + test("kill submission") { + val submissionId = "my-lyft-driver" + val killMessage = "your driver is killed" + val masterUrl = startDummyServer(killMessage = killMessage) + val response = client.killSubmission(masterUrl, submissionId) val killResponse = getKillResponse(response) - val killSuccess = killResponse.success - waitUntilFinished(submissionId) - val response2 = client.requestSubmissionStatus(masterRestUrl, submissionId) - val statusResponse = getStatusResponse(response2) - val statusSuccess = statusResponse.success - val driverState = statusResponse.driverState - assert(killSuccess) - assert(statusSuccess) - assert(driverState === DriverState.KILLED.toString) - // we should not see the expected results because we killed the submission - intercept[TestFailedException] { validateResult(resultsFile, numbers, size) } + assert(killResponse.action === Utils.getFormattedClassName(killResponse)) + assert(killResponse.serverSparkVersion === SPARK_VERSION) + assert(killResponse.message === killMessage) + assert(killResponse.submissionId === submissionId) + assert(killResponse.success) } - test("request status for empty submission") { - val response = client.requestSubmissionStatus(masterRestUrl, "submission-that-does-not-exist") + test("request submission status") { + val submissionId = "my-uber-driver" + val submissionState = KILLED + val submissionException = new Exception("there was an irresponsible mix of alcohol and cars") + val masterUrl = startDummyServer(state = submissionState, exception = Some(submissionException)) + val response = client.requestSubmissionStatus(masterUrl, submissionId) val statusResponse = getStatusResponse(response) - val statusSuccess = statusResponse.success - assert(!statusSuccess) + assert(statusResponse.action === Utils.getFormattedClassName(statusResponse)) + assert(statusResponse.serverSparkVersion === SPARK_VERSION) + assert(statusResponse.message.contains(submissionException.getMessage)) + assert(statusResponse.submissionId === submissionId) + assert(statusResponse.driverState === submissionState.toString) + assert(statusResponse.success) + } + + test("create then kill") { + val masterUrl = startSmartServer() + val request = constructSubmitRequest(masterUrl) + val response1 = client.createSubmission(masterUrl, request) + val submitResponse = getSubmitResponse(response1) + assert(submitResponse.success) + assert(submitResponse.submissionId != null) + // kill submission that was just created + val submissionId = submitResponse.submissionId + val response2 = client.killSubmission(masterUrl, submissionId) + val killResponse = getKillResponse(response2) + assert(killResponse.success) + assert(killResponse.submissionId === submissionId) + } + + test("create then request status") { + val masterUrl = startSmartServer() + val request = constructSubmitRequest(masterUrl) + val response1 = client.createSubmission(masterUrl, request) + val submitResponse = getSubmitResponse(response1) + assert(submitResponse.success) + assert(submitResponse.submissionId != null) + // request status of submission that was just created + val submissionId = 
submitResponse.submissionId + val response2 = client.requestSubmissionStatus(masterUrl, submissionId) + val statusResponse = getStatusResponse(response2) + assert(statusResponse.success) + assert(statusResponse.submissionId === submissionId) + assert(statusResponse.driverState === RUNNING.toString) + } + + test("create then kill then request status") { + val masterUrl = startSmartServer() + val request = constructSubmitRequest(masterUrl) + val response1 = client.createSubmission(masterUrl, request) + val response2 = client.createSubmission(masterUrl, request) + val submitResponse1 = getSubmitResponse(response1) + val submitResponse2 = getSubmitResponse(response2) + assert(submitResponse1.success) + assert(submitResponse2.success) + assert(submitResponse1.submissionId != null) + assert(submitResponse2.submissionId != null) + val submissionId1 = submitResponse1.submissionId + val submissionId2 = submitResponse2.submissionId + // kill only submission 1, but not submission 2 + val response3 = client.killSubmission(masterUrl, submissionId1) + val killResponse = getKillResponse(response3) + assert(killResponse.success) + assert(killResponse.submissionId === submissionId1) + // request status for both submissions: 1 should be KILLED but 2 should be RUNNING still + val response4 = client.requestSubmissionStatus(masterUrl, submissionId1) + val response5 = client.requestSubmissionStatus(masterUrl, submissionId2) + val statusResponse1 = getStatusResponse(response4) + val statusResponse2 = getStatusResponse(response5) + assert(statusResponse1.submissionId === submissionId1) + assert(statusResponse2.submissionId === submissionId2) + assert(statusResponse1.driverState === KILLED.toString) + assert(statusResponse2.driverState === RUNNING.toString) + } + + test("kill or request status before create") { + val masterUrl = startSmartServer() + val doesNotExist = "does-not-exist" + // kill a non-existent submission + val response1 = client.killSubmission(masterUrl, doesNotExist) + val killResponse = getKillResponse(response1) + assert(!killResponse.success) + assert(killResponse.submissionId === doesNotExist) + // request status for a non-existent submission + val response2 = client.requestSubmissionStatus(masterUrl, doesNotExist) + val statusResponse = getStatusResponse(response2) + assert(!statusResponse.success) + assert(statusResponse.submissionId === doesNotExist) + } + + /* ---------------------------------------- * + | Aberrant client / server behavior | + * ---------------------------------------- */ + + test("good request paths") { + val masterUrl = startSmartServer() + val httpUrl = masterUrl.replace("spark://", "http://") + val v = StandaloneRestServer.PROTOCOL_VERSION + val json = constructSubmitRequest(masterUrl).toJson + val submitRequestPath = s"$httpUrl/$v/submissions/create" + val killRequestPath = s"$httpUrl/$v/submissions/kill" + val statusRequestPath = s"$httpUrl/$v/submissions/status" + val (response1, code1) = sendHttpRequestWithResponse(submitRequestPath, "POST", json) + val (response2, code2) = sendHttpRequestWithResponse(s"$killRequestPath/anything", "POST") + val (response3, code3) = sendHttpRequestWithResponse(s"$killRequestPath/any/thing", "POST") + val (response4, code4) = sendHttpRequestWithResponse(s"$statusRequestPath/anything", "GET") + val (response5, code5) = sendHttpRequestWithResponse(s"$statusRequestPath/any/thing", "GET") + // these should all succeed and the responses should be of the correct types + getSubmitResponse(response1) + val killResponse1 = 
getKillResponse(response2) + val killResponse2 = getKillResponse(response3) + val statusResponse1 = getStatusResponse(response4) + val statusResponse2 = getStatusResponse(response5) + assert(killResponse1.submissionId === "anything") + assert(killResponse2.submissionId === "any") + assert(statusResponse1.submissionId === "anything") + assert(statusResponse2.submissionId === "any") + assert(code1 === HttpServletResponse.SC_OK) + assert(code2 === HttpServletResponse.SC_OK) + assert(code3 === HttpServletResponse.SC_OK) + assert(code4 === HttpServletResponse.SC_OK) + assert(code5 === HttpServletResponse.SC_OK) + } + + test("good request paths, bad requests") { + val masterUrl = startSmartServer() + val httpUrl = masterUrl.replace("spark://", "http://") + val v = StandaloneRestServer.PROTOCOL_VERSION + val submitRequestPath = s"$httpUrl/$v/submissions/create" + val killRequestPath = s"$httpUrl/$v/submissions/kill" + val statusRequestPath = s"$httpUrl/$v/submissions/status" + val goodJson = constructSubmitRequest(masterUrl).toJson + val badJson1 = goodJson.replaceAll("action", "fraction") // invalid JSON + val badJson2 = goodJson.substring(goodJson.size / 2) // malformed JSON + val (response1, code1) = sendHttpRequestWithResponse(submitRequestPath, "POST") // missing JSON + val (response2, code2) = sendHttpRequestWithResponse(submitRequestPath, "POST", badJson1) + val (response3, code3) = sendHttpRequestWithResponse(submitRequestPath, "POST", badJson2) + val (response4, code4) = sendHttpRequestWithResponse(killRequestPath, "POST") // missing ID + val (response5, code5) = sendHttpRequestWithResponse(s"$killRequestPath/", "POST") + val (response6, code6) = sendHttpRequestWithResponse(statusRequestPath, "GET") // missing ID + val (response7, code7) = sendHttpRequestWithResponse(s"$statusRequestPath/", "GET") + // these should all fail as error responses + getErrorResponse(response1) + getErrorResponse(response2) + getErrorResponse(response3) + getErrorResponse(response4) + getErrorResponse(response5) + getErrorResponse(response6) + getErrorResponse(response7) + assert(code1 === HttpServletResponse.SC_BAD_REQUEST) + assert(code2 === HttpServletResponse.SC_BAD_REQUEST) + assert(code3 === HttpServletResponse.SC_BAD_REQUEST) + assert(code4 === HttpServletResponse.SC_BAD_REQUEST) + assert(code5 === HttpServletResponse.SC_BAD_REQUEST) + assert(code6 === HttpServletResponse.SC_BAD_REQUEST) + assert(code7 === HttpServletResponse.SC_BAD_REQUEST) + } + + test("bad request paths") { + val masterUrl = startSmartServer() + val httpUrl = masterUrl.replace("spark://", "http://") + val v = StandaloneRestServer.PROTOCOL_VERSION + val (response1, code1) = sendHttpRequestWithResponse(httpUrl, "GET") + val (response2, code2) = sendHttpRequestWithResponse(s"$httpUrl/", "GET") + val (response3, code3) = sendHttpRequestWithResponse(s"$httpUrl/$v", "GET") + val (response4, code4) = sendHttpRequestWithResponse(s"$httpUrl/$v/", "GET") + val (response5, code5) = sendHttpRequestWithResponse(s"$httpUrl/$v/submissions", "GET") + val (response6, code6) = sendHttpRequestWithResponse(s"$httpUrl/$v/submissions/", "GET") + val (response7, code7) = sendHttpRequestWithResponse(s"$httpUrl/$v/submissions/bad", "GET") + val (response8, code8) = sendHttpRequestWithResponse(s"$httpUrl/bad-version", "GET") + assert(code1 === HttpServletResponse.SC_BAD_REQUEST) + assert(code2 === HttpServletResponse.SC_BAD_REQUEST) + assert(code3 === HttpServletResponse.SC_BAD_REQUEST) + assert(code4 === HttpServletResponse.SC_BAD_REQUEST) + assert(code5 === 
HttpServletResponse.SC_BAD_REQUEST) + assert(code6 === HttpServletResponse.SC_BAD_REQUEST) + assert(code7 === HttpServletResponse.SC_BAD_REQUEST) + assert(code8 === StandaloneRestServer.SC_UNKNOWN_PROTOCOL_VERSION) + // all responses should be error responses + val errorResponse1 = getErrorResponse(response1) + val errorResponse2 = getErrorResponse(response2) + val errorResponse3 = getErrorResponse(response3) + val errorResponse4 = getErrorResponse(response4) + val errorResponse5 = getErrorResponse(response5) + val errorResponse6 = getErrorResponse(response6) + val errorResponse7 = getErrorResponse(response7) + val errorResponse8 = getErrorResponse(response8) + // only the incompatible version response should have server protocol version set + assert(errorResponse1.highestProtocolVersion === null) + assert(errorResponse2.highestProtocolVersion === null) + assert(errorResponse3.highestProtocolVersion === null) + assert(errorResponse4.highestProtocolVersion === null) + assert(errorResponse5.highestProtocolVersion === null) + assert(errorResponse6.highestProtocolVersion === null) + assert(errorResponse7.highestProtocolVersion === null) + assert(errorResponse8.highestProtocolVersion === StandaloneRestServer.PROTOCOL_VERSION) + } + + test("server returns unknown fields") { + val masterUrl = startSmartServer() + val httpUrl = masterUrl.replace("spark://", "http://") + val v = StandaloneRestServer.PROTOCOL_VERSION + val submitRequestPath = s"$httpUrl/$v/submissions/create" + val oldJson = constructSubmitRequest(masterUrl).toJson + val oldFields = parse(oldJson).asInstanceOf[JObject].obj + val newFields = oldFields ++ Seq( + JField("tomato", JString("not-a-fruit")), + JField("potato", JString("not-po-tah-to")) + ) + val newJson = pretty(render(JObject(newFields))) + // send two requests, one with the unknown fields and the other without + val (response1, code1) = sendHttpRequestWithResponse(submitRequestPath, "POST", oldJson) + val (response2, code2) = sendHttpRequestWithResponse(submitRequestPath, "POST", newJson) + val submitResponse1 = getSubmitResponse(response1) + val submitResponse2 = getSubmitResponse(response2) + assert(code1 === HttpServletResponse.SC_OK) + assert(code2 === HttpServletResponse.SC_OK) + // only the response to the modified request should have unknown fields set + assert(submitResponse1.unknownFields === null) + assert(submitResponse2.unknownFields === Array("tomato", "potato")) + } + + test("client handles faulty server") { + val masterUrl = startFaultyServer() + val httpUrl = masterUrl.replace("spark://", "http://") + val v = StandaloneRestServer.PROTOCOL_VERSION + val submitRequestPath = s"$httpUrl/$v/submissions/create" + val killRequestPath = s"$httpUrl/$v/submissions/kill/anything" + val statusRequestPath = s"$httpUrl/$v/submissions/status/anything" + val json = constructSubmitRequest(masterUrl).toJson + // server returns malformed response unwittingly + // client should throw an appropriate exception to indicate server failure + val conn1 = sendHttpRequest(submitRequestPath, "POST", json) + intercept[SubmitRestProtocolException] { client.readResponse(conn1) } + // server attempts to send invalid response, but fails internally on validation + // client should receive an error response as server is able to recover + val conn2 = sendHttpRequest(killRequestPath, "POST") + val response2 = client.readResponse(conn2) + getErrorResponse(response2) + assert(conn2.getResponseCode === HttpServletResponse.SC_INTERNAL_SERVER_ERROR) + // server explodes internally beyond recovery 
+ // client should throw an appropriate exception to indicate server failure + val conn3 = sendHttpRequest(statusRequestPath, "GET") + intercept[SubmitRestProtocolException] { client.readResponse(conn3) } // empty response + assert(conn3.getResponseCode === HttpServletResponse.SC_INTERNAL_SERVER_ERROR) + } + + /* --------------------- * + | Helper methods | + * --------------------- */ + + /** Start a dummy server that responds to requests using the specified parameters. */ + private def startDummyServer( + submitId: String = "fake-driver-id", + submitMessage: String = "driver is submitted", + killMessage: String = "driver is killed", + state: DriverState = FINISHED, + exception: Option[Exception] = None): String = { + startServer(new DummyMaster(submitId, submitMessage, killMessage, state, exception)) + } + + /** Start a smarter dummy server that keeps track of submitted driver states. */ + private def startSmartServer(): String = { + startServer(new SmarterMaster) + } + + /** Start a dummy server that is faulty in many ways... */ + private def startFaultyServer(): String = { + startServer(new DummyMaster, faulty = true) } /** - * Start a local cluster containing one Master and a few Workers. - * Do not use [[org.apache.spark.deploy.LocalSparkCluster]] here because we want the REST URL. - * Return the Master's REST URL to which applications should be submitted. + * Start a [[StandaloneRestServer]] that communicates with the given actor. + * If `faulty` is true, start an [[FaultyStandaloneRestServer]] instead. + * Return the master URL that corresponds to the address of this server. */ - private def startLocalCluster(): String = { - val conf = new SparkConf(false) - .set("spark.master.rest.enabled", "true") - .set("spark.master.rest.port", "0") - val (numWorkers, coresPerWorker, memPerWorker) = (2, 1, 512) - val localHostName = Utils.localHostName() - val (masterSystem, masterPort, _, _masterRestPort) = - Master.startSystemAndActor(localHostName, 0, 0, conf) - val masterRestPort = _masterRestPort.getOrElse { fail("REST server not started on Master!") } - val masterUrl = "spark://" + localHostName + ":" + masterPort - val masterRestUrl = "spark://" + localHostName + ":" + masterRestPort - (1 to numWorkers).foreach { n => - val (workerSystem, _) = Worker.startSystemAndActor( - localHostName, 0, 0, coresPerWorker, memPerWorker, Array(masterUrl), null, Some(n)) - systemsToStop.append(workerSystem) - } - systemsToStop.append(masterSystem) - masterRestUrl + private def startServer(makeFakeMaster: => Actor, faulty: Boolean = false): String = { + val name = "test-standalone-rest-protocol" + val conf = new SparkConf + val localhost = Utils.localHostName() + val securityManager = new SecurityManager(conf) + val (_actorSystem, _) = AkkaUtils.createActorSystem(name, localhost, 0, conf, securityManager) + val fakeMasterRef = _actorSystem.actorOf(Props(makeFakeMaster)) + val _server = + if (faulty) { + new FaultyStandaloneRestServer(localhost, 0, fakeMasterRef, "spark://fake:7077", conf) + } else { + new StandaloneRestServer(localhost, 0, fakeMasterRef, "spark://fake:7077", conf) + } + val port = _server.start() + // set these to clean them up after every test + actorSystem = Some(_actorSystem) + server = Some(_server) + s"spark://$localhost:$port" } - /** Submit the [[StandaloneRestApp]] and return the corresponding submission ID. 
*/ - private def submitApplication(resultsFile: File, numbers: Seq[Int], size: Int): String = { - val appArgs = Seq(resultsFile.getAbsolutePath) ++ numbers.map(_.toString) ++ Seq(size.toString) + /** Create a submit request with real parameters using Spark submit. */ + private def constructSubmitRequest( + masterUrl: String, + appArgs: Array[String] = Array.empty): CreateSubmissionRequest = { + val mainClass = "main-class-not-used" + val mainJar = "dummy-jar-not-used.jar" val commandLineArgs = Array( "--deploy-mode", "cluster", - "--master", masterRestUrl, + "--master", masterUrl, "--name", mainClass, "--class", mainClass, mainJar) ++ appArgs val args = new SparkSubmitArguments(commandLineArgs) val (_, _, sparkProperties, _) = SparkSubmit.prepareSubmitEnvironment(args) - val request = client.constructSubmitRequest( - mainJar, mainClass, appArgs.toArray, sparkProperties.toMap, Map.empty) - val response = client.createSubmission(masterRestUrl, request) - val submitResponse = getSubmitResponse(response) - val submissionId = submitResponse.submissionId - assert(submissionId != null, "Application submission was unsuccessful!") - submissionId - } - - /** Wait until the given submission has finished running up to the specified timeout. */ - private def waitUntilFinished(submissionId: String, maxSeconds: Int = 30): Unit = { - var finished = false - val expireTime = System.currentTimeMillis + maxSeconds * 1000 - while (!finished) { - val response = client.requestSubmissionStatus(masterRestUrl, submissionId) - val statusResponse = getStatusResponse(response) - val driverState = statusResponse.driverState - finished = - driverState != DriverState.SUBMITTED.toString && - driverState != DriverState.RUNNING.toString - if (System.currentTimeMillis > expireTime) { - fail(s"Driver $submissionId did not finish within $maxSeconds seconds.") - } - } + client.constructSubmitRequest( + mainJar, mainClass, appArgs, sparkProperties.toMap, Map.empty) } /** Return the response as a submit response, or fail with error otherwise. */ @@ -181,85 +452,151 @@ class StandaloneRestSubmitSuite extends FunSuite with BeforeAndAfterAll with Bef } } - /** Validate whether the application produced the corrupt output. */ - private def validateResult(resultsFile: File, numbers: Seq[Int], size: Int): Unit = { - val lines = Source.fromFile(resultsFile.getAbsolutePath).getLines().toSeq - val unexpectedContent = - if (lines.nonEmpty) { - "[\n" + lines.map { l => " " + l }.mkString("\n") + "\n]" - } else { - "[EMPTY]" - } - assert(lines.size === 2, s"Unexpected content in file: $unexpectedContent") - assert(lines(0).toInt === numbers.sum, s"Sum of ${numbers.mkString(",")} is incorrect") - assert(lines(1).toInt === (size / 2) + 1, "Result of Spark job is incorrect") + /** Return the response as an error response, or fail if the response was not an error. */ + private def getErrorResponse(response: SubmitRestProtocolResponse): ErrorResponse = { + response match { + case e: ErrorResponse => e + case r => fail(s"Expected error response. Actual: ${r.toJson}") + } } -} - -private object StandaloneRestSubmitSuite { - private val pathPrefix = this.getClass.getPackage.getName.replaceAll("\\.", "/") /** - * Create a jar that contains all the class files needed for running the [[StandaloneRestApp]]. - * Return the absolute path to that jar. + * Send an HTTP request to the given URL using the method and the body specified. + * Return the connection object. 
*/ - def createJar(): String = { - val jarFile = File.createTempFile("test-standalone-rest-protocol", ".jar") - val jarFileStream = new FileOutputStream(jarFile) - val jarStream = new JarOutputStream(jarFileStream, new java.util.jar.Manifest) - jarStream.putNextEntry(new ZipEntry(pathPrefix)) - getClassFiles.foreach { cf => - jarStream.putNextEntry(new JarEntry(pathPrefix + "/" + cf.getName)) - val in = new FileInputStream(cf) - ByteStreams.copy(in, jarStream) - in.close() + private def sendHttpRequest( + url: String, + method: String, + body: String = ""): HttpURLConnection = { + val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection] + conn.setRequestMethod(method) + if (body.nonEmpty) { + conn.setDoOutput(true) + val out = new DataOutputStream(conn.getOutputStream) + out.write(body.getBytes(Charsets.UTF_8)) + out.close() } - jarStream.close() - jarFileStream.close() - jarFile.getAbsolutePath + conn } /** - * Return a list of class files compiled for [[StandaloneRestApp]]. - * This includes all the anonymous classes used in the application. + * Send an HTTP request to the given URL using the method and the body specified. + * Return a 2-tuple of the response message from the server and the response code. */ - private def getClassFiles: Seq[File] = { - val className = Utils.getFormattedClassName(StandaloneRestApp) - val clazz = StandaloneRestApp.getClass - val basePath = clazz.getProtectionDomain.getCodeSource.getLocation.toURI.getPath - val baseDir = new File(basePath + "/" + pathPrefix) - baseDir.listFiles().filter(_.getName.contains(className)) + private def sendHttpRequestWithResponse( + url: String, + method: String, + body: String = ""): (SubmitRestProtocolResponse, Int) = { + val conn = sendHttpRequest(url, method, body) + (client.readResponse(conn), conn.getResponseCode) } } /** - * Sample application to be submitted to the cluster using the REST gateway. - * All relevant classes will be packaged into a jar at run time. + * A mock standalone Master that responds with dummy messages. + * In all responses, the success parameter is always true. */ -object StandaloneRestApp { - // Usage: [path to results file] [num1] [num2] [num3] [rddSize] - // The first line of the results file should be (num1 + num2 + num3) - // The second line should be (rddSize / 2) + 1 - def main(args: Array[String]) { - assert(args.size == 5, s"Expected exactly 5 arguments: ${args.mkString(",")}") - val resultFile = new File(args(0)) - val writer = new PrintWriter(resultFile) - try { - val conf = new SparkConf() - val sc = new SparkContext(conf) - val firstLine = args(1).toInt + args(2).toInt + args(3).toInt - val secondLine = sc.parallelize(1 to args(4).toInt) - .map { i => (i / 2, i) } - .reduceByKey(_ + _) - .count() - writer.println(firstLine) - writer.println(secondLine) - } catch { - case e: Exception => - writer.println(e) - e.getStackTrace.foreach { l => writer.println(" " + l) } - } finally { - writer.close() +private class DummyMaster( + submitId: String = "fake-driver-id", + submitMessage: String = "submitted", + killMessage: String = "killed", + state: DriverState = FINISHED, + exception: Option[Exception] = None) + extends Actor { + + override def receive = { + case RequestSubmitDriver(driverDesc) => + sender ! SubmitDriverResponse(success = true, Some(submitId), submitMessage) + case RequestKillDriver(driverId) => + sender ! KillDriverResponse(driverId, success = true, killMessage) + case RequestDriverStatus(driverId) => + sender ! 
DriverStatusResponse(found = true, Some(state), None, None, exception) + } +} + +/** + * A mock standalone Master that keeps track of drivers that have been submitted. + * + * If a driver is submitted, its state is immediately set to RUNNING. + * If an existing driver is killed, its state is immediately set to KILLED. + * If an existing driver's status is requested, its state is returned in the response. + * Submits are always successful while kills and status requests are successful only + * if the driver was submitted in the past. + */ +private class SmarterMaster extends Actor { + private var counter: Int = 0 + private val submittedDrivers = new mutable.HashMap[String, DriverState] + + override def receive = { + case RequestSubmitDriver(driverDesc) => + val driverId = s"driver-$counter" + submittedDrivers(driverId) = RUNNING + counter += 1 + sender ! SubmitDriverResponse(success = true, Some(driverId), "submitted") + + case RequestKillDriver(driverId) => + val success = submittedDrivers.contains(driverId) + if (success) { + submittedDrivers(driverId) = KILLED + } + sender ! KillDriverResponse(driverId, success, "killed") + + case RequestDriverStatus(driverId) => + val found = submittedDrivers.contains(driverId) + val state = submittedDrivers.get(driverId) + sender ! DriverStatusResponse(found, state, None, None, None) + } +} + +/** + * A [[StandaloneRestServer]] that is faulty in many ways. + * + * When handling a submit request, the server returns a malformed JSON. + * When handling a kill request, the server returns an invalid JSON. + * When handling a status request, the server throws an internal exception. + * The purpose of this class is to test that client handles these cases gracefully. + */ +private class FaultyStandaloneRestServer( + host: String, + requestedPort: Int, + masterActor: ActorRef, + masterUrl: String, + masterConf: SparkConf) + extends StandaloneRestServer(host, requestedPort, masterActor, masterUrl, masterConf) { + + protected override val contextToServlet = Map[String, StandaloneRestServlet]( + s"$baseContext/create/*" -> new MalformedSubmitServlet, + s"$baseContext/kill/*" -> new InvalidKillServlet, + s"$baseContext/status/*" -> new ExplodingStatusServlet, + "/*" -> new ErrorServlet + ) + + /** A faulty servlet that produces malformed responses. */ + class MalformedSubmitServlet extends SubmitRequestServlet(masterActor, masterUrl, masterConf) { + protected override def sendResponse( + responseMessage: SubmitRestProtocolResponse, + responseServlet: HttpServletResponse): Unit = { + val badJson = responseMessage.toJson.drop(10).dropRight(20) + responseServlet.getWriter.write(badJson) + } + } + + /** A faulty servlet that produces invalid responses. */ + class InvalidKillServlet extends KillRequestServlet(masterActor, masterConf) { + protected override def handleKill(submissionId: String): KillSubmissionResponse = { + val k = super.handleKill(submissionId) + k.submissionId = null + k + } + } + + /** A faulty status servlet that explodes. 
*/ + class ExplodingStatusServlet extends StatusRequestServlet(masterActor, masterConf) { + private def explode: Int = 1 / 0 + protected override def handleStatus(submissionId: String): SubmissionStatusResponse = { + val s = super.handleStatus(submissionId) + s.workerId = explode.toString + s } } } From 947b8bd82ec0f4c45910e6d781df4661f56e4587 Mon Sep 17 00:00:00 2001 From: lianhuiwang Date: Thu, 12 Feb 2015 14:50:16 -0800 Subject: [PATCH 142/357] [SPARK-5759][Yarn]ExecutorRunnable should catch YarnException while NMClient start contain... some time since some reasons, it lead to some exception while NMClient start some containers.example:we do not config spark_shuffle on some machines, so it will throw a exception: java.lang.Error: org.apache.hadoop.yarn.exceptions.InvalidAuxServiceException: The auxService:spark_shuffle does not exist. because YarnAllocator use ThreadPoolExecutor to start Container, so we can not find which container or hostname throw exception. I think we should catch YarnException in ExecutorRunnable when start container. if there are some exceptions, we can know the container id or hostname of failed container. Author: lianhuiwang Closes #4554 from lianhuiwang/SPARK-5759 and squashes the following commits: caf5a99 [lianhuiwang] use SparkException to warp exception c02140f [lianhuiwang] ExecutorRunnable should catch YarnException while NMClient start container --- .../apache/spark/deploy/yarn/ExecutorRunnable.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index 6d5b8fda76ab8..c1d3f7320f53c 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -38,7 +38,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.ipc.YarnRPC import org.apache.hadoop.yarn.util.{ConverterUtils, Records} -import org.apache.spark.{SecurityManager, SparkConf, Logging} +import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException} import org.apache.spark.network.util.JavaUtils class ExecutorRunnable( @@ -109,7 +109,13 @@ class ExecutorRunnable( } // Send the start request to the ContainerManager - nmClient.startContainer(container, ctx) + try { + nmClient.startContainer(container, ctx) + } catch { + case ex: Exception => + throw new SparkException(s"Exception while starting container ${container.getId}" + + s" on host $hostname", ex) + } } private def prepareCommand( From 26c816e7388eaa336a59183029f86548f1cc279c Mon Sep 17 00:00:00 2001 From: "David Y. Ross" Date: Thu, 12 Feb 2015 14:52:38 -0800 Subject: [PATCH 143/357] SPARK-5747: Fix wordsplitting bugs in make-distribution.sh The `$MVN` command variable may have spaces, so when referring to it, must wrap in quotes. Author: David Y. Ross Closes #4540 from dyross/dyr-fix-make-distribution2 and squashes the following commits: 5a41596 [David Y. 
Ross] SPARK-5747: Fix wordsplitting bugs in make-distribution.sh --- make-distribution.sh | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/make-distribution.sh b/make-distribution.sh index 051c87c0894ae..dd990d4b96e46 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -98,9 +98,9 @@ done if [ -z "$JAVA_HOME" ]; then # Fall back on JAVA_HOME from rpm, if found if [ $(command -v rpm) ]; then - RPM_JAVA_HOME=$(rpm -E %java_home 2>/dev/null) + RPM_JAVA_HOME="$(rpm -E %java_home 2>/dev/null)" if [ "$RPM_JAVA_HOME" != "%java_home" ]; then - JAVA_HOME=$RPM_JAVA_HOME + JAVA_HOME="$RPM_JAVA_HOME" echo "No JAVA_HOME set, proceeding with '$JAVA_HOME' learned from rpm" fi fi @@ -113,24 +113,24 @@ fi if [ $(command -v git) ]; then GITREV=$(git rev-parse --short HEAD 2>/dev/null || :) - if [ ! -z $GITREV ]; then + if [ ! -z "$GITREV" ]; then GITREVSTRING=" (git revision $GITREV)" fi unset GITREV fi -if [ ! $(command -v $MVN) ] ; then +if [ ! $(command -v "$MVN") ] ; then echo -e "Could not locate Maven command: '$MVN'." echo -e "Specify the Maven command with the --mvn flag" exit -1; fi -VERSION=$($MVN help:evaluate -Dexpression=project.version 2>/dev/null | grep -v "INFO" | tail -n 1) -SPARK_HADOOP_VERSION=$($MVN help:evaluate -Dexpression=hadoop.version $@ 2>/dev/null\ +VERSION=$("$MVN" help:evaluate -Dexpression=project.version 2>/dev/null | grep -v "INFO" | tail -n 1) +SPARK_HADOOP_VERSION=$("$MVN" help:evaluate -Dexpression=hadoop.version $@ 2>/dev/null\ | grep -v "INFO"\ | tail -n 1) -SPARK_HIVE=$($MVN help:evaluate -Dexpression=project.activeProfiles -pl sql/hive $@ 2>/dev/null\ +SPARK_HIVE=$("$MVN" help:evaluate -Dexpression=project.activeProfiles -pl sql/hive $@ 2>/dev/null\ | grep -v "INFO"\ | fgrep --count "hive";\ # Reset exit status to 0, otherwise the script stops here if the last grep finds nothing\ @@ -147,7 +147,7 @@ if [[ ! "$JAVA_VERSION" =~ "1.6" && -z "$SKIP_JAVA_TEST" ]]; then echo "Output from 'java -version' was:" echo "$JAVA_VERSION" read -p "Would you like to continue anyways? [y,n]: " -r - if [[ ! $REPLY =~ ^[Yy]$ ]]; then + if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then echo "Okay, exiting." exit 1 fi @@ -232,7 +232,7 @@ cp -r "$SPARK_HOME/ec2" "$DISTDIR" if [ "$SPARK_TACHYON" == "true" ]; then TMPD=`mktemp -d 2>/dev/null || mktemp -d -t 'disttmp'` - pushd $TMPD > /dev/null + pushd "$TMPD" > /dev/null echo "Fetching tachyon tgz" TACHYON_DL="${TACHYON_TGZ}.part" @@ -259,7 +259,7 @@ if [ "$SPARK_TACHYON" == "true" ]; then fi popd > /dev/null - rm -rf $TMPD + rm -rf "$TMPD" fi if [ "$MAKE_TGZ" == "true" ]; then From 0bf031582588723dd5a4ca42e6f9f36bc2da1a0b Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 12 Feb 2015 14:54:38 -0800 Subject: [PATCH 144/357] [SPARK-5780] [PySpark] Mute the logging during unit tests There a bunch of logging coming from driver and worker, it's noisy and scaring, and a lots of exception in it, people are confusing about the tests are failing or not. This PR will mute the logging during tests, only show them if any one failed. 
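A minimal Python sketch of the same idea, capturing everything in a log file and only dumping it when a test fails (the helper below is hypothetical; the actual change is the one-line redirect in python/run-tests shown in the diff):

    import os, subprocess, sys

    def run_test(script, log_file):
        # Append all driver/worker output to the log instead of echoing it.
        with open(log_file, "a") as log:
            ret = subprocess.call(["bin/pyspark", script],
                                  stdout=log, stderr=subprocess.STDOUT,
                                  env=dict(os.environ, SPARK_TESTING="1"))
        if ret != 0:
            # Only surface the captured output when a test actually fails.
            with open(log_file) as log:
                sys.stderr.write(log.read())
        return ret
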
Author: Davies Liu Closes #4572 from davies/mute and squashes the following commits: 1e9069c [Davies Liu] mute the logging during python tests --- python/run-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests b/python/run-tests index 58a26dd8ff088..077ad60d764a3 100755 --- a/python/run-tests +++ b/python/run-tests @@ -35,7 +35,7 @@ rm -rf metastore warehouse function run_test() { echo "Running test: $1" | tee -a $LOG_FILE - SPARK_TESTING=1 time "$FWDIR"/bin/pyspark $1 2>&1 | tee -a $LOG_FILE + SPARK_TESTING=1 time "$FWDIR"/bin/pyspark $1 >> $LOG_FILE 2>&1 FAILED=$((PIPESTATUS[0]||$FAILED)) From c352ffbdb9112714c176a747edff6115e9369e58 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 12 Feb 2015 15:17:25 -0800 Subject: [PATCH 145/357] [SPARK-5758][SQL] Use LongType as the default type for integers in JSON schema inference. Author: Yin Huai Closes #4544 from yhuai/jsonUseLongTypeByDefault and squashes the following commits: 6e2ffc2 [Yin Huai] Use LongType as the default type for integers in JSON schema inference. --- .../org/apache/spark/sql/json/JsonRDD.scala | 12 ++++++++---- .../spark/sql/api/java/JavaApplySchemaSuite.java | 2 +- .../org/apache/spark/sql/json/JsonSuite.scala | 16 ++++++++-------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index 1043eefcfc6a5..7dfb30402136e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -17,13 +17,12 @@ package org.apache.spark.sql.json -import java.io.StringWriter -import java.sql.{Date, Timestamp} +import java.sql.Timestamp import scala.collection.Map import scala.collection.convert.Wrappers.{JMapWrapper, JListWrapper} -import com.fasterxml.jackson.core.{JsonGenerator, JsonProcessingException, JsonFactory} +import com.fasterxml.jackson.core.{JsonGenerator, JsonProcessingException} import com.fasterxml.jackson.databind.ObjectMapper import org.apache.spark.rdd.RDD @@ -178,7 +177,12 @@ private[sql] object JsonRDD extends Logging { } private def typeOfPrimitiveValue: PartialFunction[Any, DataType] = { - ScalaReflection.typeOfObject orElse { + // For Integer values, use LongType by default. + val useLongType: PartialFunction[Any, DataType] = { + case value: IntegerType.JvmType => LongType + } + + useLongType orElse ScalaReflection.typeOfObject orElse { // Since we do not have a data type backed by BigInteger, // when we see a Java BigInteger, we use DecimalType. 
case value: java.math.BigInteger => DecimalType.Unlimited diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java index 2e6e977fdc752..643b891ab1b63 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java +++ b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java @@ -164,7 +164,7 @@ public void applySchemaToJSON() { fields.add(DataTypes.createStructField("bigInteger", DataTypes.createDecimalType(), true)); fields.add(DataTypes.createStructField("boolean", DataTypes.BooleanType, true)); fields.add(DataTypes.createStructField("double", DataTypes.DoubleType, true)); - fields.add(DataTypes.createStructField("integer", DataTypes.IntegerType, true)); + fields.add(DataTypes.createStructField("integer", DataTypes.LongType, true)); fields.add(DataTypes.createStructField("long", DataTypes.LongType, true)); fields.add(DataTypes.createStructField("null", DataTypes.StringType, true)); fields.add(DataTypes.createStructField("string", DataTypes.StringType, true)); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index fde4b47438c56..b5f13f8bd5e80 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -222,7 +222,7 @@ class JsonSuite extends QueryTest { StructField("bigInteger", DecimalType.Unlimited, true) :: StructField("boolean", BooleanType, true) :: StructField("double", DoubleType, true) :: - StructField("integer", IntegerType, true) :: + StructField("integer", LongType, true) :: StructField("long", LongType, true) :: StructField("null", StringType, true) :: StructField("string", StringType, true) :: Nil) @@ -252,7 +252,7 @@ class JsonSuite extends QueryTest { StructField("arrayOfBigInteger", ArrayType(DecimalType.Unlimited, false), true) :: StructField("arrayOfBoolean", ArrayType(BooleanType, false), true) :: StructField("arrayOfDouble", ArrayType(DoubleType, false), true) :: - StructField("arrayOfInteger", ArrayType(IntegerType, false), true) :: + StructField("arrayOfInteger", ArrayType(LongType, false), true) :: StructField("arrayOfLong", ArrayType(LongType, false), true) :: StructField("arrayOfNull", ArrayType(StringType, true), true) :: StructField("arrayOfString", ArrayType(StringType, false), true) :: @@ -265,7 +265,7 @@ class JsonSuite extends QueryTest { StructField("field1", BooleanType, true) :: StructField("field2", DecimalType.Unlimited, true) :: Nil), true) :: StructField("structWithArrayFields", StructType( - StructField("field1", ArrayType(IntegerType, false), true) :: + StructField("field1", ArrayType(LongType, false), true) :: StructField("field2", ArrayType(StringType, false), true) :: Nil), true) :: Nil) assert(expectedSchema === jsonDF.schema) @@ -486,7 +486,7 @@ class JsonSuite extends QueryTest { val jsonDF = jsonRDD(complexFieldValueTypeConflict) val expectedSchema = StructType( - StructField("array", ArrayType(IntegerType, false), true) :: + StructField("array", ArrayType(LongType, false), true) :: StructField("num_struct", StringType, true) :: StructField("str_array", StringType, true) :: StructField("struct", StructType( @@ -540,7 +540,7 @@ class JsonSuite extends QueryTest { val expectedSchema = StructType( StructField("a", BooleanType, true) :: StructField("b", LongType, true) :: - StructField("c", 
ArrayType(IntegerType, false), true) :: + StructField("c", ArrayType(LongType, false), true) :: StructField("d", StructType( StructField("field", BooleanType, true) :: Nil), true) :: StructField("e", StringType, true) :: Nil) @@ -560,7 +560,7 @@ class JsonSuite extends QueryTest { StructField("bigInteger", DecimalType.Unlimited, true) :: StructField("boolean", BooleanType, true) :: StructField("double", DoubleType, true) :: - StructField("integer", IntegerType, true) :: + StructField("integer", LongType, true) :: StructField("long", LongType, true) :: StructField("null", StringType, true) :: StructField("string", StringType, true) :: Nil) @@ -781,12 +781,12 @@ class JsonSuite extends QueryTest { ArrayType(ArrayType(ArrayType(ArrayType(StringType, false), false), true), false), true) :: StructField("field2", ArrayType(ArrayType( - StructType(StructField("Test", IntegerType, true) :: Nil), false), true), true) :: + StructType(StructField("Test", LongType, true) :: Nil), false), true), true) :: StructField("field3", ArrayType(ArrayType( StructType(StructField("Test", StringType, true) :: Nil), true), false), true) :: StructField("field4", - ArrayType(ArrayType(ArrayType(IntegerType, false), true), false), true) :: Nil) + ArrayType(ArrayType(ArrayType(LongType, false), true), false), true) :: Nil) assert(schema === jsonDF.schema) From ee04a8b19be8330bfc48f470ef365622162c915f Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 12 Feb 2015 15:19:19 -0800 Subject: [PATCH 146/357] [SPARK-5573][SQL] Add explode to dataframes Author: Michael Armbrust Closes #4546 from marmbrus/explode and squashes the following commits: eefd33a [Michael Armbrust] whitespace a8d496c [Michael Armbrust] Merge remote-tracking branch 'apache/master' into explode 4af740e [Michael Armbrust] Merge remote-tracking branch 'origin/master' into explode dc86a5c [Michael Armbrust] simple version d633d01 [Michael Armbrust] add scala specific 950707a [Michael Armbrust] fix comments ba8854c [Michael Armbrust] [SPARK-5573][SQL] Add explode to dataframes --- .../sql/catalyst/expressions/generators.scala | 19 ++++++++++ .../org/apache/spark/sql/DataFrame.scala | 38 +++++++++++++++++++ .../org/apache/spark/sql/DataFrameImpl.scala | 30 ++++++++++++++- .../apache/spark/sql/IncomputableColumn.scala | 9 +++++ .../org/apache/spark/sql/DataFrameSuite.scala | 25 ++++++++++++ 5 files changed, 119 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala index 43b6482c0171c..0983d274def3f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala @@ -73,6 +73,25 @@ abstract class Generator extends Expression { } } +/** + * A generator that produces its output using the provided lambda function. + */ +case class UserDefinedGenerator( + schema: Seq[Attribute], + function: Row => TraversableOnce[Row], + children: Seq[Expression]) + extends Generator{ + + override protected def makeOutput(): Seq[Attribute] = schema + + override def eval(input: Row): TraversableOnce[Row] = { + val inputRow = new InterpretedProjection(children) + function(inputRow(input)) + } + + override def toString = s"UserDefinedGenerator(${children.mkString(",")})" +} + /** * Given an input array produces a sequence of rows for each value in the array. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 13aff760e9a5c..65257882f4e7c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import scala.collection.JavaConversions._ import scala.reflect.ClassTag +import scala.reflect.runtime.universe.TypeTag import scala.util.control.NonFatal import org.apache.spark.annotation.{DeveloperApi, Experimental} @@ -441,6 +442,43 @@ trait DataFrame extends RDDApi[Row] with Serializable { sample(withReplacement, fraction, Utils.random.nextLong) } + /** + * (Scala-specific) Returns a new [[DataFrame]] where each row has been expanded to zero or more + * rows by the provided function. This is similar to a `LATERAL VIEW` in HiveQL. The columns of + * the input row are implicitly joined with each row that is output by the function. + * + * The following example uses this function to count the number of books which contain + * a given word: + * + * {{{ + * case class Book(title: String, words: String) + * val df: RDD[Book] + * + * case class Word(word: String) + * val allWords = df.explode('words) { + * case Row(words: String) => words.split(" ").map(Word(_)) + * } + * + * val bookCountPerWord = allWords.groupBy("word").agg(countDistinct("title")) + * }}} + */ + def explode[A <: Product : TypeTag](input: Column*)(f: Row => TraversableOnce[A]): DataFrame + + + /** + * (Scala-specific) Returns a new [[DataFrame]] where a single column has been expanded to zero + * or more rows by the provided function. This is similar to a `LATERAL VIEW` in HiveQL. All + * columns of the input row are implicitly joined with each value that is output by the function. + * + * {{{ + * df.explode("words", "word")(words: String => words.split(" ")) + * }}} + */ + def explode[A, B : TypeTag]( + inputColumn: String, + outputColumn: String)( + f: A => TraversableOnce[B]): DataFrame + ///////////////////////////////////////////////////////////////////////////// /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 4c6e19cace8ca..bb5c6226a2217 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -21,6 +21,7 @@ import java.io.CharArrayWriter import scala.language.implicitConversions import scala.reflect.ClassTag +import scala.reflect.runtime.universe.TypeTag import scala.collection.JavaConversions._ import com.fasterxml.jackson.core.JsonFactory @@ -29,7 +30,7 @@ import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.SerDeUtil import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel -import org.apache.spark.sql.catalyst.{SqlParser, ScalaReflection} +import org.apache.spark.sql.catalyst.{expressions, SqlParser, ScalaReflection} import org.apache.spark.sql.catalyst.analysis.{ResolvedStar, UnresolvedRelation} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.{JoinType, Inner} @@ -39,7 +40,6 @@ import org.apache.spark.sql.json.JsonRDD import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{NumericType, StructType} - /** * Internal implementation of [[DataFrame]]. Users of the API should use [[DataFrame]] directly. 
*/ @@ -282,6 +282,32 @@ private[sql] class DataFrameImpl protected[sql]( Sample(fraction, withReplacement, seed, logicalPlan) } + override def explode[A <: Product : TypeTag] + (input: Column*)(f: Row => TraversableOnce[A]): DataFrame = { + val schema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType] + val attributes = schema.toAttributes + val rowFunction = + f.andThen(_.map(ScalaReflection.convertToCatalyst(_, schema).asInstanceOf[Row])) + val generator = UserDefinedGenerator(attributes, rowFunction, input.map(_.expr)) + + Generate(generator, join = true, outer = false, None, logicalPlan) + } + + override def explode[A, B : TypeTag]( + inputColumn: String, + outputColumn: String)( + f: A => TraversableOnce[B]): DataFrame = { + val dataType = ScalaReflection.schemaFor[B].dataType + val attributes = AttributeReference(outputColumn, dataType)() :: Nil + def rowFunction(row: Row) = { + f(row(0).asInstanceOf[A]).map(o => Row(ScalaReflection.convertToCatalyst(o, dataType))) + } + val generator = UserDefinedGenerator(attributes, rowFunction, apply(inputColumn).expr :: Nil) + + Generate(generator, join = true, outer = false, None, logicalPlan) + + } + ///////////////////////////////////////////////////////////////////////////// // RDD API ///////////////////////////////////////////////////////////////////////////// diff --git a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala index 4f9d92d97646f..19c8e3b4f4708 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql import scala.reflect.ClassTag +import scala.reflect.runtime.universe.TypeTag import org.apache.spark.api.java.JavaRDD import org.apache.spark.rdd.RDD @@ -110,6 +111,14 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten override def sample(withReplacement: Boolean, fraction: Double, seed: Long): DataFrame = err() + override def explode[A <: Product : TypeTag] + (input: Column*)(f: Row => TraversableOnce[A]): DataFrame = err() + + override def explode[A, B : TypeTag]( + inputColumn: String, + outputColumn: String)( + f: A => TraversableOnce[B]): DataFrame = err() + ///////////////////////////////////////////////////////////////////////////// override def head(n: Int): Array[Row] = err() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 7be9215a443f0..33b35f376b270 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -98,6 +98,31 @@ class DataFrameSuite extends QueryTest { sql("SELECT a.key, b.key FROM testData a JOIN testData b ON a.key = b.key").collect().toSeq) } + test("simple explode") { + val df = Seq(Tuple1("a b c"), Tuple1("d e")).toDataFrame("words") + + checkAnswer( + df.explode("words", "word") { word: String => word.split(" ").toSeq }.select('word), + Row("a") :: Row("b") :: Row("c") :: Row("d") ::Row("e") :: Nil + ) + } + + test("explode") { + val df = Seq((1, "a b c"), (2, "a b"), (3, "a")).toDataFrame("number", "letters") + val df2 = + df.explode('letters) { + case Row(letters: String) => letters.split(" ").map(Tuple1(_)).toSeq + } + + checkAnswer( + df2 + .select('_1 as 'letter, 'number) + .groupBy('letter) + .agg('letter, 
countDistinct('number)), + Row("a", 3) :: Row("b", 2) :: Row("c", 1) :: Nil + ) + } + test("selectExpr") { checkAnswer( testData.selectExpr("abs(key)", "value"), From d5fc51491808630d0328a5937dbf349e00de361f Mon Sep 17 00:00:00 2001 From: Daoyuan Wang Date: Thu, 12 Feb 2015 15:22:07 -0800 Subject: [PATCH 147/357] [SPARK-5755] [SQL] remove unnecessary Add explain extended select +key from src; before: == Parsed Logical Plan == 'Project [(0 + 'key) AS _c0#8] 'UnresolvedRelation [src], None == Analyzed Logical Plan == Project [(0 + key#10) AS _c0#8] MetastoreRelation test, src, None == Optimized Logical Plan == Project [(0 + key#10) AS _c0#8] MetastoreRelation test, src, None == Physical Plan == Project [(0 + key#10) AS _c0#8] HiveTableScan [key#10], (MetastoreRelation test, src, None), None after this patch: == Parsed Logical Plan == 'Project ['key] 'UnresolvedRelation [src], None == Analyzed Logical Plan == Project [key#10] MetastoreRelation test, src, None == Optimized Logical Plan == Project [key#10] MetastoreRelation test, src, None == Physical Plan == HiveTableScan [key#10], (MetastoreRelation test, src, None), None Author: Daoyuan Wang Closes #4551 from adrian-wang/positive and squashes the following commits: 0821ae4 [Daoyuan Wang] remove unnecessary Add --- sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index f3c9e63652a8e..5269460e5b6bc 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -1099,7 +1099,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C Cast(nodeToExpr(arg), DateType) /* Arithmetic */ - case Token("+", child :: Nil) => Add(Literal(0), nodeToExpr(child)) + case Token("+", child :: Nil) => nodeToExpr(child) case Token("-", child :: Nil) => UnaryMinus(nodeToExpr(child)) case Token("~", child :: Nil) => BitwiseNot(nodeToExpr(child)) case Token("+", left :: right:: Nil) => Add(nodeToExpr(left), nodeToExpr(right)) From ada993e954e2825c0fe13326fc23b0e1a567cd55 Mon Sep 17 00:00:00 2001 From: Vladimir Grigor Date: Thu, 12 Feb 2015 23:26:24 +0000 Subject: [PATCH 148/357] [SPARK-5335] Fix deletion of security groups within a VPC Please see https://issues.apache.org/jira/browse/SPARK-5335. The fix itself is in e58a8b01a8bedcbfbbc6d04b1c1489255865cf87 commit. Two earlier commits are fixes of another VPC related bug waiting to be merged. I should have created former bug fix in own branch then this fix would not have former fixes. :( This code is released under the project's license. 
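The gist of the fix, as a hedged boto sketch (assuming a boto EC2 connection and that the cluster's security groups have already been looked up; the group name prefix is hypothetical): delete by group id rather than by name, since name-based deletion does not work for groups inside a VPC.

    import boto.ec2

    # Assumes AWS credentials are available in the environment.
    conn = boto.ec2.connect_to_region("us-east-1")
    groups = [g for g in conn.get_all_security_groups()
              if g.name.startswith("my-cluster")]  # hypothetical prefix
    for group in groups:
        # Deleting by name fails for groups that live in a VPC;
        # passing group_id works for both VPC and EC2-Classic groups.
        conn.delete_security_group(group_id=group.id)
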
Author: Vladimir Grigor Author: Vladimir Grigor Closes #4122 from voukka/SPARK-5335_delete_sg_vpc and squashes the following commits: 090dca9 [Vladimir Grigor] fixes as per review: removed printing of group_id and added comment 730ec05 [Vladimir Grigor] fix for SPARK-5335: Destroying cluster in VPC with "--delete-groups" fails to remove security groups --- ec2/spark_ec2.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 35209aec5fcfe..c59ab565c6862 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -1174,11 +1174,12 @@ def real_main(): time.sleep(30) # Yes, it does have to be this long :-( for group in groups: try: - conn.delete_security_group(group.name) - print "Deleted security group " + group.name + # It is needed to use group_id to make it work with VPC + conn.delete_security_group(group_id=group.id) + print "Deleted security group %s" % group.name except boto.exception.EC2ResponseError: success = False - print "Failed to delete security group " + group.name + print "Failed to delete security group %s" % group.name # Unfortunately, group.revoke() returns True even if a rule was not # deleted, so this needs to be rerun if something fails From c025a468826e9b9f62032e207daa9d42d9dba3ca Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 12 Feb 2015 15:32:17 -0800 Subject: [PATCH 149/357] [SQL] Move SaveMode to SQL package. Author: Yin Huai Closes #4542 from yhuai/moveSaveMode and squashes the following commits: 65a4425 [Yin Huai] Move SaveMode to sql package. --- python/pyspark/sql/dataframe.py | 2 +- .../main/java/org/apache/spark/sql/{sources => }/SaveMode.java | 2 +- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 1 - .../main/scala/org/apache/spark/sql/IncomputableColumn.scala | 1 - .../main/scala/org/apache/spark/sql/json/JSONRelation.scala | 2 +- .../main/scala/org/apache/spark/sql/parquet/newParquet.scala | 3 +-- sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala | 2 +- .../main/scala/org/apache/spark/sql/sources/interfaces.scala | 2 +- .../scala/org/apache/spark/sql/sources/SaveLoadSuite.scala | 2 +- .../scala/org/apache/spark/sql/hive/execution/commands.scala | 2 +- .../apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java | 2 +- .../org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala | 2 -- 12 files changed, 9 insertions(+), 14 deletions(-) rename sql/core/src/main/java/org/apache/spark/sql/{sources => }/SaveMode.java (97%) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 3eb56ed74cc6f..b6f052ee44ae2 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -149,7 +149,7 @@ def insertInto(self, tableName, overwrite=False): def _java_save_mode(self, mode): """Returns the Java save mode based on the Python save mode represented by a string. 
""" - jSaveMode = self._sc._jvm.org.apache.spark.sql.sources.SaveMode + jSaveMode = self._sc._jvm.org.apache.spark.sql.SaveMode jmode = jSaveMode.ErrorIfExists mode = mode.lower() if mode == "append": diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/SaveMode.java b/sql/core/src/main/java/org/apache/spark/sql/SaveMode.java similarity index 97% rename from sql/core/src/main/java/org/apache/spark/sql/sources/SaveMode.java rename to sql/core/src/main/java/org/apache/spark/sql/SaveMode.java index 3109f5716da2c..a40be526d0d11 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/SaveMode.java +++ b/sql/core/src/main/java/org/apache/spark/sql/SaveMode.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.sources; +package org.apache.spark.sql; /** * SaveMode is used to specify the expected behavior of saving a DataFrame to a data source. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 65257882f4e7c..4f8f19e2c1465 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -27,7 +27,6 @@ import org.apache.spark.api.java.JavaRDD import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.sources.SaveMode import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala index 19c8e3b4f4708..cba3b77011cc3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedSt import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.storage.StorageLevel -import org.apache.spark.sql.sources.SaveMode import org.apache.spark.sql.types.StructType private[sql] class IncomputableColumn(protected[sql] val expr: Expression) extends Column { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala index 51ff2443f3717..24848634de9cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala @@ -21,7 +21,7 @@ import java.io.IOException import org.apache.hadoop.fs.Path -import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index aef9c10fbcd01..3a9f0600617be 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -44,9 +44,8 @@ import org.apache.spark.rdd.{NewHadoopPartition, NewHadoopRDD, RDD} import org.apache.spark.sql.catalyst.expressions._ import 
org.apache.spark.sql.parquet.ParquetTypesConverter._ import org.apache.spark.sql.sources._ +import org.apache.spark.sql.{DataFrame, Row, SaveMode, SQLConf, SQLContext} import org.apache.spark.sql.types.{IntegerType, StructField, StructType, _} -import org.apache.spark.sql.types.StructType._ -import org.apache.spark.sql.{DataFrame, Row, SQLConf, SQLContext} import org.apache.spark.{Partition => SparkPartition, TaskContext, SerializableWritable, Logging, SparkException} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index d3d72089c3303..8cac9c0fdf7fa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.sources import scala.language.implicitConversions import org.apache.spark.Logging -import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 37fda7ba6e5d0..0c4b706eeebae 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.sources import org.apache.spark.annotation.{Experimental, DeveloperApi} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.sql.{SaveMode, DataFrame, Row, SQLContext} import org.apache.spark.sql.catalyst.expressions.{Expression, Attribute} import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala index a51004567175c..607488ccfdd6a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala @@ -22,7 +22,7 @@ import java.io.File import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.catalyst.util -import org.apache.spark.sql.{SQLConf, DataFrame} +import org.apache.spark.sql.{SaveMode, SQLConf, DataFrame} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index ce0db1125c27f..0aa5f7f7b88bd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.sources._ -import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.RunnableCommand diff --git 
a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java index 313e84756b6bb..53ddecf57958b 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java @@ -25,7 +25,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.spark.sql.sources.SaveMode; +import org.apache.spark.sql.SaveMode; import org.junit.After; import org.junit.Assert; import org.junit.Before; diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index f94aabd29ad23..2916724f66e24 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -18,8 +18,6 @@ package org.apache.spark.sql.hive import java.io.File - -import org.apache.spark.sql.sources.SaveMode import org.scalatest.BeforeAndAfterEach import org.apache.commons.io.FileUtils From 1d0596a16e1d3add2631f5d8169aeec2876a1362 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 12 Feb 2015 18:08:01 -0800 Subject: [PATCH 150/357] [SPARK-3299][SQL]Public API in SQLContext to list tables https://issues.apache.org/jira/browse/SPARK-3299 Author: Yin Huai Closes #4547 from yhuai/tables and squashes the following commits: 6c8f92e [Yin Huai] Add tableNames. acbb281 [Yin Huai] Update Python test. 7793dcb [Yin Huai] Fix scala test. 572870d [Yin Huai] Address comments. aba2e88 [Yin Huai] Format. 12c86df [Yin Huai] Add tables() to SQLContext to return a DataFrame containing existing tables. --- python/pyspark/sql/context.py | 34 ++++++++ .../spark/sql/catalyst/analysis/Catalog.scala | 37 +++++++++ .../org/apache/spark/sql/SQLContext.scala | 36 +++++++++ .../apache/spark/sql/ListTablesSuite.scala | 76 ++++++++++++++++++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 5 ++ .../spark/sql/hive/ListTablesSuite.scala | 77 +++++++++++++++++++ 6 files changed, 265 insertions(+) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index db4bcbece2c1b..082f1b691b196 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -621,6 +621,40 @@ def table(self, tableName): """ return DataFrame(self._ssql_ctx.table(tableName), self) + def tables(self, dbName=None): + """Returns a DataFrame containing names of tables in the given database. + + If `dbName` is not specified, the current database will be used. + + The returned DataFrame has two columns, tableName and isTemporary + (a column with BooleanType indicating if a table is a temporary one or not). + + >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> df2 = sqlCtx.tables() + >>> df2.filter("tableName = 'table1'").first() + Row(tableName=u'table1', isTemporary=True) + """ + if dbName is None: + return DataFrame(self._ssql_ctx.tables(), self) + else: + return DataFrame(self._ssql_ctx.tables(dbName), self) + + def tableNames(self, dbName=None): + """Returns a list of names of tables in the database `dbName`. + + If `dbName` is not specified, the current database will be used. 
+ + >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> "table1" in sqlCtx.tableNames() + True + >>> "table1" in sqlCtx.tableNames("db") + True + """ + if dbName is None: + return [name for name in self._ssql_ctx.tableNames()] + else: + return [name for name in self._ssql_ctx.tableNames(dbName)] + def cacheTable(self, tableName): """Caches the specified table in-memory.""" self._ssql_ctx.cacheTable(tableName) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala index df8d03b86c533..f57eab24607f8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala @@ -34,6 +34,12 @@ trait Catalog { tableIdentifier: Seq[String], alias: Option[String] = None): LogicalPlan + /** + * Returns tuples of (tableName, isTemporary) for all tables in the given database. + * isTemporary is a Boolean value indicates if a table is a temporary or not. + */ + def getTables(databaseName: Option[String]): Seq[(String, Boolean)] + def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit def unregisterTable(tableIdentifier: Seq[String]): Unit @@ -101,6 +107,12 @@ class SimpleCatalog(val caseSensitive: Boolean) extends Catalog { // properly qualified with this alias. alias.map(a => Subquery(a, tableWithQualifiers)).getOrElse(tableWithQualifiers) } + + override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { + tables.map { + case (name, _) => (name, true) + }.toSeq + } } /** @@ -137,6 +149,27 @@ trait OverrideCatalog extends Catalog { withAlias.getOrElse(super.lookupRelation(tableIdentifier, alias)) } + abstract override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { + val dbName = if (!caseSensitive) { + if (databaseName.isDefined) Some(databaseName.get.toLowerCase) else None + } else { + databaseName + } + + val temporaryTables = overrides.filter { + // If a temporary table does not have an associated database, we should return its name. + case ((None, _), _) => true + // If a temporary table does have an associated database, we should return it if the database + // matches the given database name. + case ((db: Some[String], _), _) if db == dbName => true + case _ => false + }.map { + case ((_, tableName), _) => (tableName, true) + }.toSeq + + temporaryTables ++ super.getTables(databaseName) + } + override def registerTable( tableIdentifier: Seq[String], plan: LogicalPlan): Unit = { @@ -172,6 +205,10 @@ object EmptyCatalog extends Catalog { throw new UnsupportedOperationException } + override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { + throw new UnsupportedOperationException + } + def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit = { throw new UnsupportedOperationException } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 8aae222acd927..0f8af75fe740d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -774,6 +774,42 @@ class SQLContext(@transient val sparkContext: SparkContext) def table(tableName: String): DataFrame = DataFrame(this, catalog.lookupRelation(Seq(tableName))) + /** + * Returns a [[DataFrame]] containing names of existing tables in the given database. 
+ * The returned DataFrame has two columns, tableName and isTemporary (a column with BooleanType + * indicating if a table is a temporary one or not). + */ + def tables(): DataFrame = { + createDataFrame(catalog.getTables(None)).toDataFrame("tableName", "isTemporary") + } + + /** + * Returns a [[DataFrame]] containing names of existing tables in the current database. + * The returned DataFrame has two columns, tableName and isTemporary (a column with BooleanType + * indicating if a table is a temporary one or not). + */ + def tables(databaseName: String): DataFrame = { + createDataFrame(catalog.getTables(Some(databaseName))).toDataFrame("tableName", "isTemporary") + } + + /** + * Returns an array of names of tables in the current database. + */ + def tableNames(): Array[String] = { + catalog.getTables(None).map { + case (tableName, _) => tableName + }.toArray + } + + /** + * Returns an array of names of tables in the given database. + */ + def tableNames(databaseName: String): Array[String] = { + catalog.getTables(Some(databaseName)).map { + case (tableName, _) => tableName + }.toArray + } + protected[sql] class SparkPlanner extends SparkStrategies { val sparkContext: SparkContext = self.sparkContext diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala new file mode 100644 index 0000000000000..5fc35349e166e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala @@ -0,0 +1,76 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +package org.apache.spark.sql + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.test.TestSQLContext._ +import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType} + +class ListTablesSuite extends QueryTest with BeforeAndAfter { + + import org.apache.spark.sql.test.TestSQLContext.implicits._ + + val df = + sparkContext.parallelize((1 to 10).map(i => (i,s"str$i"))).toDataFrame("key", "value") + + before { + df.registerTempTable("ListTablesSuiteTable") + } + + after { + catalog.unregisterTable(Seq("ListTablesSuiteTable")) + } + + test("get all tables") { + checkAnswer( + tables().filter("tableName = 'ListTablesSuiteTable'"), + Row("ListTablesSuiteTable", true)) + + catalog.unregisterTable(Seq("ListTablesSuiteTable")) + assert(tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0) + } + + test("getting all Tables with a database name has no impact on returned table names") { + checkAnswer( + tables("DB").filter("tableName = 'ListTablesSuiteTable'"), + Row("ListTablesSuiteTable", true)) + + catalog.unregisterTable(Seq("ListTablesSuiteTable")) + assert(tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0) + } + + test("query the returned DataFrame of tables") { + val tableDF = tables() + val schema = StructType( + StructField("tableName", StringType, true) :: + StructField("isTemporary", BooleanType, false) :: Nil) + assert(schema === tableDF.schema) + + tableDF.registerTempTable("tables") + checkAnswer( + sql("SELECT isTemporary, tableName from tables WHERE tableName = 'ListTablesSuiteTable'"), + Row(true, "ListTablesSuiteTable") + ) + checkAnswer( + tables().filter("tableName = 'tables'").select("tableName", "isTemporary"), + Row("tables", true)) + dropTempTable("tables") + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index c78369d12cf55..eb1ee54247bea 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -198,6 +198,11 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with } } + override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { + val dbName = databaseName.getOrElse(hive.sessionState.getCurrentDatabase) + client.getAllTables(dbName).map(tableName => (tableName, false)) + } + /** * Create table with specified database, table name, table description and schema * @param databaseName Database Name diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala new file mode 100644 index 0000000000000..068aa03330c33 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala @@ -0,0 +1,77 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql.hive + +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.Row + +class ListTablesSuite extends QueryTest with BeforeAndAfterAll { + + import org.apache.spark.sql.hive.test.TestHive.implicits._ + + val df = + sparkContext.parallelize((1 to 10).map(i => (i,s"str$i"))).toDataFrame("key", "value") + + override def beforeAll(): Unit = { + // The catalog in HiveContext is a case insensitive one. + catalog.registerTable(Seq("ListTablesSuiteTable"), df.logicalPlan) + catalog.registerTable(Seq("ListTablesSuiteDB", "InDBListTablesSuiteTable"), df.logicalPlan) + sql("CREATE TABLE HiveListTablesSuiteTable (key int, value string)") + sql("CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB") + sql("CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)") + } + + override def afterAll(): Unit = { + catalog.unregisterTable(Seq("ListTablesSuiteTable")) + catalog.unregisterTable(Seq("ListTablesSuiteDB", "InDBListTablesSuiteTable")) + sql("DROP TABLE IF EXISTS HiveListTablesSuiteTable") + sql("DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable") + sql("DROP DATABASE IF EXISTS ListTablesSuiteDB") + } + + test("get all tables of current database") { + val allTables = tables() + // We are using default DB. + checkAnswer( + allTables.filter("tableName = 'listtablessuitetable'"), + Row("listtablessuitetable", true)) + assert(allTables.filter("tableName = 'indblisttablessuitetable'").count() === 0) + checkAnswer( + allTables.filter("tableName = 'hivelisttablessuitetable'"), + Row("hivelisttablessuitetable", false)) + assert(allTables.filter("tableName = 'hiveindblisttablessuitetable'").count() === 0) + } + + test("getting all tables with a database name") { + val allTables = tables("ListTablesSuiteDB") + checkAnswer( + allTables.filter("tableName = 'listtablessuitetable'"), + Row("listtablessuitetable", true)) + checkAnswer( + allTables.filter("tableName = 'indblisttablessuitetable'"), + Row("indblisttablessuitetable", true)) + assert(allTables.filter("tableName = 'hivelisttablessuitetable'").count() === 0) + checkAnswer( + allTables.filter("tableName = 'hiveindblisttablessuitetable'"), + Row("hiveindblisttablessuitetable", false)) + } +} From 2aea892ebd4d6c802defeef35ef7ebfe42c06eba Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 12 Feb 2015 20:37:55 -0800 Subject: [PATCH 151/357] [SQL] Fix docs of SQLContext.tables Author: Yin Huai Closes #4579 from yhuai/tablesDoc and squashes the following commits: 7f8964c [Yin Huai] Fix doc. 
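For reference, a minimal sketch of how the table-listing API introduced in the two patches above might be used from Scala. This is illustrative only: it assumes an already-constructed SQLContext (named `sqlContext` here) with at least one table registered, and the helper name `printCatalog` is not part of the patch.
```
import org.apache.spark.sql.SQLContext

// Illustrative sketch (assumed names): list the catalog contents using the
// tables()/tableNames() methods added by [PATCH 150/357].
def printCatalog(sqlContext: SQLContext): Unit = {
  // tables() returns a DataFrame with two columns: tableName and isTemporary.
  sqlContext.tables().collect().foreach(println)

  // tableNames() returns only the names; a database name can also be passed explicitly.
  val names: Array[String] = sqlContext.tableNames()
  println(names.mkString(", "))
}
```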
--- .../main/scala/org/apache/spark/sql/SQLContext.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 0f8af75fe740d..2165949d32c6f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -775,8 +775,8 @@ class SQLContext(@transient val sparkContext: SparkContext) DataFrame(this, catalog.lookupRelation(Seq(tableName))) /** - * Returns a [[DataFrame]] containing names of existing tables in the given database. - * The returned DataFrame has two columns, tableName and isTemporary (a column with BooleanType + * Returns a [[DataFrame]] containing names of existing tables in the current database. + * The returned DataFrame has two columns, tableName and isTemporary (a Boolean * indicating if a table is a temporary one or not). */ def tables(): DataFrame = { @@ -784,8 +784,8 @@ class SQLContext(@transient val sparkContext: SparkContext) } /** - * Returns a [[DataFrame]] containing names of existing tables in the current database. - * The returned DataFrame has two columns, tableName and isTemporary (a column with BooleanType + * Returns a [[DataFrame]] containing names of existing tables in the given database. + * The returned DataFrame has two columns, tableName and isTemporary (a Boolean * indicating if a table is a temporary one or not). */ def tables(databaseName: String): DataFrame = { @@ -793,7 +793,7 @@ class SQLContext(@transient val sparkContext: SparkContext) } /** - * Returns an array of names of tables in the current database. + * Returns the names of tables in the current database as an array. */ def tableNames(): Array[String] = { catalog.getTables(None).map { @@ -802,7 +802,7 @@ class SQLContext(@transient val sparkContext: SparkContext) } /** - * Returns an array of names of tables in the given database. + * Returns the names of tables in the given database as an array. */ def tableNames(databaseName: String): Array[String] = { catalog.getTables(Some(databaseName)).map { From 1c8633f3fe9d814c83384e339b958740c250c00c Mon Sep 17 00:00:00 2001 From: tianyi Date: Thu, 12 Feb 2015 22:18:39 -0800 Subject: [PATCH 152/357] [SPARK-3365][SQL]Wrong schema generated for List type This PR fixes the issue SPARK-3365.
The reason is Spark generated wrong schema for the type `List` in `ScalaReflection.scala` for example: the generated schema for type `Seq[String]` is: ``` {"name":"x","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}` ``` the generated schema for type `List[String]` is: ``` {"name":"x","type":{"type":"struct","fields":[]},"nullable":true,"metadata":{}}` ``` Author: tianyi Closes #4581 from tianyi/SPARK-3365 and squashes the following commits: a097e86 [tianyi] change the order of resolution in ScalaReflection.scala --- .../spark/sql/catalyst/ScalaReflection.scala | 30 +++++++++---------- .../sql/catalyst/ScalaReflectionSuite.scala | 5 ++++ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 5d9c331ca5178..11fd443733658 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -122,6 +122,21 @@ trait ScalaReflection { case t if t <:< typeOf[Option[_]] => val TypeRef(_, _, Seq(optType)) = t Schema(schemaFor(optType).dataType, nullable = true) + // Need to decide if we actually need a special type here. + case t if t <:< typeOf[Array[Byte]] => Schema(BinaryType, nullable = true) + case t if t <:< typeOf[Array[_]] => + val TypeRef(_, _, Seq(elementType)) = t + val Schema(dataType, nullable) = schemaFor(elementType) + Schema(ArrayType(dataType, containsNull = nullable), nullable = true) + case t if t <:< typeOf[Seq[_]] => + val TypeRef(_, _, Seq(elementType)) = t + val Schema(dataType, nullable) = schemaFor(elementType) + Schema(ArrayType(dataType, containsNull = nullable), nullable = true) + case t if t <:< typeOf[Map[_, _]] => + val TypeRef(_, _, Seq(keyType, valueType)) = t + val Schema(valueDataType, valueNullable) = schemaFor(valueType) + Schema(MapType(schemaFor(keyType).dataType, + valueDataType, valueContainsNull = valueNullable), nullable = true) case t if t <:< typeOf[Product] => val formalTypeArgs = t.typeSymbol.asClass.typeParams val TypeRef(_, _, actualTypeArgs) = t @@ -144,21 +159,6 @@ trait ScalaReflection { schemaFor(p.typeSignature.substituteTypes(formalTypeArgs, actualTypeArgs)) StructField(p.name.toString, dataType, nullable) }), nullable = true) - // Need to decide if we actually need a special type here. 
- case t if t <:< typeOf[Array[Byte]] => Schema(BinaryType, nullable = true) - case t if t <:< typeOf[Array[_]] => - val TypeRef(_, _, Seq(elementType)) = t - val Schema(dataType, nullable) = schemaFor(elementType) - Schema(ArrayType(dataType, containsNull = nullable), nullable = true) - case t if t <:< typeOf[Seq[_]] => - val TypeRef(_, _, Seq(elementType)) = t - val Schema(dataType, nullable) = schemaFor(elementType) - Schema(ArrayType(dataType, containsNull = nullable), nullable = true) - case t if t <:< typeOf[Map[_, _]] => - val TypeRef(_, _, Seq(keyType, valueType)) = t - val Schema(valueDataType, valueNullable) = schemaFor(valueType) - Schema(MapType(schemaFor(keyType).dataType, - valueDataType, valueContainsNull = valueNullable), nullable = true) case t if t <:< typeOf[String] => Schema(StringType, nullable = true) case t if t <:< typeOf[Timestamp] => Schema(TimestampType, nullable = true) case t if t <:< typeOf[java.sql.Date] => Schema(DateType, nullable = true) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index d0f547d187ecb..eee00e3f7ea76 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -61,6 +61,7 @@ case class OptionalData( case class ComplexData( arrayField: Seq[Int], arrayField1: Array[Int], + arrayField2: List[Int], arrayFieldContainsNull: Seq[java.lang.Integer], mapField: Map[Int, Long], mapFieldValueContainsNull: Map[Int, java.lang.Long], @@ -137,6 +138,10 @@ class ScalaReflectionSuite extends FunSuite { "arrayField1", ArrayType(IntegerType, containsNull = false), nullable = true), + StructField( + "arrayField2", + ArrayType(IntegerType, containsNull = false), + nullable = true), StructField( "arrayFieldContainsNull", ArrayType(IntegerType, containsNull = true), From 1768bd51438670c493ca3ca02988aee3ae31e87e Mon Sep 17 00:00:00 2001 From: WangTaoTheTonic Date: Fri, 13 Feb 2015 10:27:23 +0000 Subject: [PATCH 153/357] [SPARK-4832][Deploy]some other processes might take the daemon pid Some other processes might use the pid saved in pid file. In that case we should ignore it and launch daemons. JIRA is down for maintenance. I will file one once it return. Author: WangTaoTheTonic Author: WangTaoTheTonic Closes #3683 from WangTaoTheTonic/otherproc and squashes the following commits: daa86a1 [WangTaoTheTonic] some bash style fix 8befee7 [WangTaoTheTonic] handle the mistake scenario cf4ecc6 [WangTaoTheTonic] remove redundant condition f36cfb4 [WangTaoTheTonic] some other processes might take the pid --- sbin/spark-daemon.sh | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sbin/spark-daemon.sh b/sbin/spark-daemon.sh index 89608bc41b71d..ec6d0b5a40ef2 100755 --- a/sbin/spark-daemon.sh +++ b/sbin/spark-daemon.sh @@ -129,8 +129,9 @@ case $option in mkdir -p "$SPARK_PID_DIR" if [ -f $pid ]; then - if kill -0 `cat $pid` > /dev/null 2>&1; then - echo $command running as process `cat $pid`. Stop it first. + TARGET_ID="$(cat "$pid")" + if [[ $(ps -p "$TARGET_ID" -o args=) =~ $command ]]; then + echo "$command running as process $TARGET_ID. Stop it first." 
exit 1 fi fi @@ -141,7 +142,7 @@ case $option in fi spark_rotate_log "$log" - echo starting $command, logging to $log + echo "starting $command, logging to $log" if [ $option == spark-submit ]; then source "$SPARK_HOME"/bin/utils.sh gatherSparkSubmitOpts "$@" @@ -154,7 +155,7 @@ case $option in echo $newpid > $pid sleep 2 # Check if the process has died; in that case we'll tail the log so the user can see - if ! kill -0 $newpid >/dev/null 2>&1; then + if [[ ! $(ps -p "$newpid" -o args=) =~ $command ]]; then echo "failed to launch $command:" tail -2 "$log" | sed 's/^/ /' echo "full log in $log" @@ -164,14 +165,15 @@ case $option in (stop) if [ -f $pid ]; then - if kill -0 `cat $pid` > /dev/null 2>&1; then - echo stopping $command - kill `cat $pid` + TARGET_ID="$(cat "$pid")" + if [[ $(ps -p "$TARGET_ID" -o args=) =~ $command ]]; then + echo "stopping $command" + kill "$TARGET_ID" else - echo no $command to stop + echo "no $command to stop" fi else - echo no $command to stop + echo "no $command to stop" fi ;; From c0ccd2564182695ea5771524840bf1a99d5aa842 Mon Sep 17 00:00:00 2001 From: uncleGen Date: Fri, 13 Feb 2015 09:43:10 -0800 Subject: [PATCH 154/357] [SPARK-5732][CORE]:Add an option to print the spark version in spark script. Naturally, we may need to add an option to print the spark version in spark script. It is pretty common in script tool. ![9](https://cloud.githubusercontent.com/assets/7402327/6183331/cab1b74e-b38e-11e4-9daa-e26e6015cff3.JPG) Author: uncleGen Author: genmao.ygm Closes #4522 from uncleGen/master-clean-150211 and squashes the following commits: 9f2127c [genmao.ygm] revert the behavior of "-v" 015ddee [uncleGen] minor changes 463f02c [uncleGen] minor changes --- .../org/apache/spark/deploy/SparkSubmit.scala | 18 +++++++++++++++--- .../spark/deploy/SparkSubmitArguments.scala | 4 ++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 80cc0587286b1..54399e99c98f0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -37,6 +37,7 @@ import org.apache.ivy.core.settings.IvySettings import org.apache.ivy.plugins.matcher.GlobPatternMatcher import org.apache.ivy.plugins.resolver.{ChainResolver, IBiblioResolver} +import org.apache.spark.SPARK_VERSION import org.apache.spark.deploy.rest._ import org.apache.spark.executor._ import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils} @@ -83,14 +84,25 @@ object SparkSubmit { // Exposed for testing private[spark] var exitFn: () => Unit = () => System.exit(1) private[spark] var printStream: PrintStream = System.err - private[spark] def printWarning(str: String) = printStream.println("Warning: " + str) - private[spark] def printErrorAndExit(str: String) = { + private[spark] def printWarning(str: String): Unit = printStream.println("Warning: " + str) + private[spark] def printErrorAndExit(str: String): Unit = { printStream.println("Error: " + str) printStream.println("Run with --help for usage help or --verbose for debug output") exitFn() } + private[spark] def printVersionAndExit(): Unit = { + printStream.println("""Welcome to + ____ __ + / __/__ ___ _____/ /__ + _\ \/ _ \/ _ `/ __/ '_/ + /___/ .__/\_,_/_/ /_/\_\ version %s + /_/ + """.format(SPARK_VERSION)) + printStream.println("Type --help for more information.") + exitFn() + } - def main(args: Array[String]) { + def 
main(args: Array[String]): Unit = { val appArgs = new SparkSubmitArguments(args) if (appArgs.verbose) { printStream.println(appArgs) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index fa38070c6fcfe..82e66a374249c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -417,6 +417,9 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St verbose = true parse(tail) + case ("--version") :: tail => + SparkSubmit.printVersionAndExit() + case EQ_SEPARATED_OPT(opt, value) :: tail => parse(opt :: value :: tail) @@ -485,6 +488,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St | | --help, -h Show this help message and exit | --verbose, -v Print additional debug output + | --version, Print the version of current Spark | | Spark standalone with cluster deploy mode only: | --driver-cores NUM Cores for driver (Default: 1). From e1a1ff8108463ca79299ec0eb555a0c8db9dffa0 Mon Sep 17 00:00:00 2001 From: sboeschhuawei Date: Fri, 13 Feb 2015 09:45:57 -0800 Subject: [PATCH 155/357] [SPARK-5503][MLLIB] Example code for Power Iteration Clustering Author: sboeschhuawei Closes #4495 from javadba/picexamples and squashes the following commits: 3c84b14 [sboeschhuawei] PIC Examples updates from Xiangrui's comments round 5 2878675 [sboeschhuawei] Fourth round with xiangrui on PICExample d7ac350 [sboeschhuawei] Updates to PICExample from Xiangrui's comments round 3 d7f0cba [sboeschhuawei] Updates to PICExample from Xiangrui's comments round 3 cef28f4 [sboeschhuawei] Further updates to PICExample from Xiangrui's comments f7ff43d [sboeschhuawei] Update to PICExample from Xiangrui's comments efeec45 [sboeschhuawei] Update to PICExample from Xiangrui's comments 03e8de4 [sboeschhuawei] Added PICExample c509130 [sboeschhuawei] placeholder for pic examples 5864d4a [sboeschhuawei] placeholder for pic examples --- .../PowerIterationClusteringExample.scala | 160 ++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala new file mode 100644 index 0000000000000..b2373adba1fd4 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.mllib + +import org.apache.log4j.{Level, Logger} +import scopt.OptionParser + +import org.apache.spark.mllib.clustering.PowerIterationClustering +import org.apache.spark.rdd.RDD +import org.apache.spark.{SparkConf, SparkContext} + +/** + * An example Power Iteration Clustering http://www.icml2010.org/papers/387.pdf app. + * Takes an input of K concentric circles and the number of points in the innermost circle. + * The output should be K clusters - each cluster containing precisely the points associated + * with each of the input circles. + * + * Run with + * {{{ + * ./bin/run-example mllib.PowerIterationClusteringExample [options] + * + * Where options include: + * k: Number of circles/clusters + * n: Number of sampled points on innermost circle.. There are proportionally more points + * within the outer/larger circles + * maxIterations: Number of Power Iterations + * outerRadius: radius of the outermost of the concentric circles + * }}} + * + * Here is a sample run and output: + * + * ./bin/run-example mllib.PowerIterationClusteringExample + * -k 3 --n 30 --maxIterations 15 + * + * Cluster assignments: 1 -> [0,1,2,3,4],2 -> [5,6,7,8,9,10,11,12,13,14], + * 0 -> [15,16,17,18,19,20,21,22,23,24,25,26,27,28,29] + * + * + * If you use it as a template to create your own app, please use `spark-submit` to submit your app. + */ +object PowerIterationClusteringExample { + + case class Params( + input: String = null, + k: Int = 3, + numPoints: Int = 5, + maxIterations: Int = 10, + outerRadius: Double = 3.0 + ) extends AbstractParams[Params] + + def main(args: Array[String]) { + val defaultParams = Params() + + val parser = new OptionParser[Params]("PIC Circles") { + head("PowerIterationClusteringExample: an example PIC app using concentric circles.") + opt[Int]('k', "k") + .text(s"number of circles (/clusters), default: ${defaultParams.k}") + .action((x, c) => c.copy(k = x)) + opt[Int]('n', "n") + .text(s"number of points in smallest circle, default: ${defaultParams.numPoints}") + .action((x, c) => c.copy(numPoints = x)) + opt[Int]("maxIterations") + .text(s"number of iterations, default: ${defaultParams.maxIterations}") + .action((x, c) => c.copy(maxIterations = x)) + opt[Int]('r', "r") + .text(s"radius of outermost circle, default: ${defaultParams.outerRadius}") + .action((x, c) => c.copy(numPoints = x)) + } + + parser.parse(args, defaultParams).map { params => + run(params) + }.getOrElse { + sys.exit(1) + } + } + + def run(params: Params) { + val conf = new SparkConf() + .setMaster("local") + .setAppName(s"PowerIterationClustering with $params") + val sc = new SparkContext(conf) + + Logger.getRootLogger.setLevel(Level.WARN) + + val circlesRdd = generateCirclesRdd(sc, params.k, params.numPoints, params.outerRadius) + val model = new PowerIterationClustering() + .setK(params.k) + .setMaxIterations(params.maxIterations) + .run(circlesRdd) + + val clusters = model.assignments.collect.groupBy(_._2).mapValues(_.map(_._1)) + val assignments = clusters.toList.sortBy { case (k, v) => v.length} + val assignmentsStr = assignments + .map { case (k, v) => + s"$k -> ${v.sorted.mkString("[", ",", "]")}" + }.mkString(",") + val sizesStr = assignments.map { + _._2.size + }.sorted.mkString("(", ",", ")") + println(s"Cluster assignments: $assignmentsStr\ncluster sizes: $sizesStr") + + sc.stop() + } + + def generateCircle(radius: Double, n: Int) = { + Seq.tabulate(n) { i => + val theta = 2.0 * math.Pi * i / n + (radius * math.cos(theta), radius * math.sin(theta)) + } + } + 
+ def generateCirclesRdd(sc: SparkContext, + nCircles: Int = 3, + nPoints: Int = 30, + outerRadius: Double): RDD[(Long, Long, Double)] = { + + val radii = Array.tabulate(nCircles) { cx => outerRadius / (nCircles - cx)} + val groupSizes = Array.tabulate(nCircles) { cx => (cx + 1) * nPoints} + val points = (0 until nCircles).flatMap { cx => + generateCircle(radii(cx), groupSizes(cx)) + }.zipWithIndex + val rdd = sc.parallelize(points) + val distancesRdd = rdd.cartesian(rdd).flatMap { case (((x0, y0), i0), ((x1, y1), i1)) => + if (i0 < i1) { + Some((i0.toLong, i1.toLong, gaussianSimilarity((x0, y0), (x1, y1), 1.0))) + } else { + None + } + } + distancesRdd + } + + /** + * Gaussian Similarity: http://en.wikipedia.org/wiki/Radial_basis_function_kernel + */ + def gaussianSimilarity(p1: (Double, Double), p2: (Double, Double), sigma: Double) = { + val coeff = 1.0 / (math.sqrt(2.0 * math.Pi) * sigma) + val expCoeff = -1.0 / 2.0 * math.pow(sigma, 2.0) + val ssquares = (p1._1 - p2._1) * (p1._1 - p2._1) + (p1._2 - p2._2) * (p1._2 - p2._2) + coeff * math.exp(expCoeff * ssquares) + // math.exp((p1._1 - p2._1) * (p1._1 - p2._1) + (p1._2 - p2._2) * (p1._2 - p2._2)) + } + + +} From fc6d3e796a3c600e2f7827562455d555e59775ae Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Fri, 13 Feb 2015 09:47:26 -0800 Subject: [PATCH 156/357] [SPARK-5783] Better eventlog-parsing error messages Author: Ryan Williams Closes #4573 from ryan-williams/history and squashes the following commits: a8647ec [Ryan Williams] fix test calls to .replay() 98aa3fe [Ryan Williams] include filename in history-parsing error message 8deecf0 [Ryan Williams] add line number to history-parsing error message b668b52 [Ryan Williams] add log info line to history-eventlog parsing --- .../apache/spark/deploy/history/FsHistoryProvider.scala | 3 ++- .../scala/org/apache/spark/deploy/master/Master.scala | 2 +- .../org/apache/spark/scheduler/ReplayListenerBus.scala | 9 ++++++--- .../org/apache/spark/scheduler/ReplayListenerSuite.scala | 4 ++-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 868c63d30a202..885fa0fdbf85b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -247,6 +247,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis */ private def replay(eventLog: FileStatus, bus: ReplayListenerBus): FsApplicationHistoryInfo = { val logPath = eventLog.getPath() + logInfo(s"Replaying log path: $logPath") val (logInput, sparkVersion) = if (isLegacyLogDirectory(eventLog)) { openLegacyEventLog(logPath) @@ -256,7 +257,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis try { val appListener = new ApplicationEventListener bus.addListener(appListener) - bus.replay(logInput, sparkVersion) + bus.replay(logInput, sparkVersion, logPath.toString) new FsApplicationHistoryInfo( logPath.getName(), appListener.appId.getOrElse(logPath.getName()), diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 53e453990f8c7..8cc6ec1e8192c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -761,7 +761,7 @@ private[spark] class 
Master( val ui = SparkUI.createHistoryUI(new SparkConf, replayBus, new SecurityManager(conf), appName + " (completed)", HistoryServer.UI_PATH_PREFIX + s"/${app.id}") try { - replayBus.replay(logInput, sparkVersion) + replayBus.replay(logInput, sparkVersion, eventLogFile) } finally { logInput.close() } diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index 584f4e7789d1a..d9c3a10dc5413 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -40,21 +40,24 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { * * @param logData Stream containing event log data. * @param version Spark version that generated the events. + * @param sourceName Filename (or other source identifier) from whence @logData is being read */ - def replay(logData: InputStream, version: String) { + def replay(logData: InputStream, version: String, sourceName: String) { var currentLine: String = null + var lineNumber: Int = 1 try { val lines = Source.fromInputStream(logData).getLines() lines.foreach { line => currentLine = line postToAll(JsonProtocol.sparkEventFromJson(parse(line))) + lineNumber += 1 } } catch { case ioe: IOException => throw ioe case e: Exception => - logError("Exception in parsing Spark event log.", e) - logError("Malformed line: %s\n".format(currentLine)) + logError(s"Exception parsing Spark event log: $sourceName", e) + logError(s"Malformed line #$lineNumber: $currentLine\n") } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala index 7e360cc6082ec..702c4cb3bdef9 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala @@ -61,7 +61,7 @@ class ReplayListenerSuite extends FunSuite with BeforeAndAfter { try { val replayer = new ReplayListenerBus() replayer.addListener(eventMonster) - replayer.replay(logData, SPARK_VERSION) + replayer.replay(logData, SPARK_VERSION, logFilePath.toString) } finally { logData.close() } @@ -120,7 +120,7 @@ class ReplayListenerSuite extends FunSuite with BeforeAndAfter { try { val replayer = new ReplayListenerBus() replayer.addListener(eventMonster) - replayer.replay(logData, version) + replayer.replay(logData, version, eventLog.getPath().toString) } finally { logData.close() } From 077eec2d9dba197f51004ee4a322d0fa71424ea0 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 13 Feb 2015 09:53:57 -0800 Subject: [PATCH 157/357] [SPARK-5735] Replace uses of EasyMock with Mockito This patch replaces all uses of EasyMock with Mockito. There are two motivations for this: 1. We should use a single mocking framework in our tests in order to keep things consistent. 2. EasyMock may be responsible for non-deterministic unit test failures due to its Objenesis dependency (see SPARK-5626 for more details). Most of these changes are fairly mechanical translations of EasyMock code to Mockito, although I made a small change that strengthens the assertions in one test in KinesisReceiverSuite. Author: Josh Rosen Closes #4578 from JoshRosen/SPARK-5735-remove-easymock and squashes the following commits: 0ab192b [Josh Rosen] Import sorting plus two minor changes to more closely match old semantics. 977565b [Josh Rosen] Remove EasyMock from build.
fae1d8f [Josh Rosen] Remove EasyMock usage in KinesisReceiverSuite. 7cca486 [Josh Rosen] Remove EasyMock usage in MesosSchedulerBackendSuite fc5e94d [Josh Rosen] Remove EasyMock in CacheManagerSuite --- core/pom.xml | 10 - .../org/apache/spark/CacheManagerSuite.scala | 42 ++- .../mesos/MesosSchedulerBackendSuite.scala | 125 +++++---- extras/kinesis-asl/pom.xml | 5 - .../kinesis/KinesisReceiverSuite.scala | 263 +++++++++--------- pom.xml | 13 - 6 files changed, 207 insertions(+), 251 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index 66180035e61f1..c993781c0e0d6 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -329,16 +329,6 @@ scalacheck_${scala.binary.version} test - - org.easymock - easymockclassextension - test - - - asm - asm - test - junit junit diff --git a/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala b/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala index d7d9dc7b50f30..4b25c200a695a 100644 --- a/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/CacheManagerSuite.scala @@ -17,16 +17,18 @@ package org.apache.spark +import org.mockito.Mockito._ import org.scalatest.{BeforeAndAfter, FunSuite} -import org.scalatest.mock.EasyMockSugar +import org.scalatest.mock.MockitoSugar -import org.apache.spark.executor.{DataReadMethod, TaskMetrics} +import org.apache.spark.executor.DataReadMethod import org.apache.spark.rdd.RDD import org.apache.spark.storage._ // TODO: Test the CacheManager's thread-safety aspects -class CacheManagerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar { - var sc : SparkContext = _ +class CacheManagerSuite extends FunSuite with LocalSparkContext with BeforeAndAfter + with MockitoSugar { + var blockManager: BlockManager = _ var cacheManager: CacheManager = _ var split: Partition = _ @@ -57,10 +59,6 @@ class CacheManagerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar }.cache() } - after { - sc.stop() - } - test("get uncached rdd") { // Do not mock this test, because attempting to match Array[Any], which is not covariant, // in blockManager.put is a losing battle. You have been warned. @@ -75,29 +73,21 @@ class CacheManagerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar } test("get cached rdd") { - expecting { - val result = new BlockResult(Array(5, 6, 7).iterator, DataReadMethod.Memory, 12) - blockManager.get(RDDBlockId(0, 0)).andReturn(Some(result)) - } + val result = new BlockResult(Array(5, 6, 7).iterator, DataReadMethod.Memory, 12) + when(blockManager.get(RDDBlockId(0, 0))).thenReturn(Some(result)) - whenExecuting(blockManager) { - val context = new TaskContextImpl(0, 0, 0, 0) - val value = cacheManager.getOrCompute(rdd, split, context, StorageLevel.MEMORY_ONLY) - assert(value.toList === List(5, 6, 7)) - } + val context = new TaskContextImpl(0, 0, 0, 0) + val value = cacheManager.getOrCompute(rdd, split, context, StorageLevel.MEMORY_ONLY) + assert(value.toList === List(5, 6, 7)) } test("get uncached local rdd") { - expecting { - // Local computation should not persist the resulting value, so don't expect a put(). - blockManager.get(RDDBlockId(0, 0)).andReturn(None) - } + // Local computation should not persist the resulting value, so don't expect a put(). 
+ when(blockManager.get(RDDBlockId(0, 0))).thenReturn(None) - whenExecuting(blockManager) { - val context = new TaskContextImpl(0, 0, 0, 0, true) - val value = cacheManager.getOrCompute(rdd, split, context, StorageLevel.MEMORY_ONLY) - assert(value.toList === List(1, 2, 3, 4)) - } + val context = new TaskContextImpl(0, 0, 0, 0, true) + val value = cacheManager.getOrCompute(rdd, split, context, StorageLevel.MEMORY_ONLY) + assert(value.toList === List(1, 2, 3, 4)) } test("verify task metrics updated correctly") { diff --git a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala index 46ab02bfef780..8cd302e2b4313 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala @@ -17,45 +17,47 @@ package org.apache.spark.scheduler.mesos -import org.apache.spark.executor.MesosExecutorBackend -import org.scalatest.FunSuite -import org.apache.spark.{SparkConf, SparkContext, LocalSparkContext} -import org.apache.spark.scheduler.{SparkListenerExecutorAdded, LiveListenerBus, - TaskDescription, WorkerOffer, TaskSchedulerImpl} -import org.apache.spark.scheduler.cluster.ExecutorInfo -import org.apache.spark.scheduler.cluster.mesos.{MemoryUtils, MesosSchedulerBackend} -import org.apache.mesos.SchedulerDriver -import org.apache.mesos.Protos.{ExecutorInfo => MesosExecutorInfo, _} -import org.apache.mesos.Protos.Value.Scalar -import org.easymock.{Capture, EasyMock} import java.nio.ByteBuffer -import java.util.Collections import java.util -import org.scalatest.mock.EasyMockSugar +import java.util.Collections import scala.collection.mutable import scala.collection.mutable.ArrayBuffer -class MesosSchedulerBackendSuite extends FunSuite with LocalSparkContext with EasyMockSugar { +import org.apache.mesos.SchedulerDriver +import org.apache.mesos.Protos._ +import org.apache.mesos.Protos.Value.Scalar +import org.mockito.Mockito._ +import org.mockito.Matchers._ +import org.mockito.{ArgumentCaptor, Matchers} +import org.scalatest.FunSuite +import org.scalatest.mock.MockitoSugar + +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext} +import org.apache.spark.executor.MesosExecutorBackend +import org.apache.spark.scheduler._ +import org.apache.spark.scheduler.cluster.ExecutorInfo +import org.apache.spark.scheduler.cluster.mesos.{MesosSchedulerBackend, MemoryUtils} + +class MesosSchedulerBackendSuite extends FunSuite with LocalSparkContext with MockitoSugar { test("check spark-class location correctly") { val conf = new SparkConf conf.set("spark.mesos.executor.home" , "/mesos-home") - val listenerBus = EasyMock.createMock(classOf[LiveListenerBus]) - listenerBus.post(SparkListenerExecutorAdded(EasyMock.anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty))) - EasyMock.replay(listenerBus) - - val sc = EasyMock.createMock(classOf[SparkContext]) - EasyMock.expect(sc.getSparkHome()).andReturn(Option("/spark-home")).anyTimes() - EasyMock.expect(sc.conf).andReturn(conf).anyTimes() - EasyMock.expect(sc.executorEnvs).andReturn(new mutable.HashMap).anyTimes() - EasyMock.expect(sc.executorMemory).andReturn(100).anyTimes() - EasyMock.expect(sc.listenerBus).andReturn(listenerBus) - EasyMock.replay(sc) - val taskScheduler = EasyMock.createMock(classOf[TaskSchedulerImpl]) - EasyMock.expect(taskScheduler.CPUS_PER_TASK).andReturn(2).anyTimes() - EasyMock.replay(taskScheduler) 
+ val listenerBus = mock[LiveListenerBus] + listenerBus.post( + SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty))) + + val sc = mock[SparkContext] + when(sc.getSparkHome()).thenReturn(Option("/spark-home")) + + when(sc.conf).thenReturn(conf) + when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String]) + when(sc.executorMemory).thenReturn(100) + when(sc.listenerBus).thenReturn(listenerBus) + val taskScheduler = mock[TaskSchedulerImpl] + when(taskScheduler.CPUS_PER_TASK).thenReturn(2) val mesosSchedulerBackend = new MesosSchedulerBackend(taskScheduler, sc, "master") @@ -84,20 +86,19 @@ class MesosSchedulerBackendSuite extends FunSuite with LocalSparkContext with Ea .setSlaveId(SlaveID.newBuilder().setValue(s"s${id.toString}")).setHostname(s"host${id.toString}").build() } - val driver = EasyMock.createMock(classOf[SchedulerDriver]) - val taskScheduler = EasyMock.createMock(classOf[TaskSchedulerImpl]) + val driver = mock[SchedulerDriver] + val taskScheduler = mock[TaskSchedulerImpl] - val listenerBus = EasyMock.createMock(classOf[LiveListenerBus]) - listenerBus.post(SparkListenerExecutorAdded(EasyMock.anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty))) - EasyMock.replay(listenerBus) + val listenerBus = mock[LiveListenerBus] + listenerBus.post( + SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty))) - val sc = EasyMock.createMock(classOf[SparkContext]) - EasyMock.expect(sc.executorMemory).andReturn(100).anyTimes() - EasyMock.expect(sc.getSparkHome()).andReturn(Option("/path")).anyTimes() - EasyMock.expect(sc.executorEnvs).andReturn(new mutable.HashMap).anyTimes() - EasyMock.expect(sc.conf).andReturn(new SparkConf).anyTimes() - EasyMock.expect(sc.listenerBus).andReturn(listenerBus) - EasyMock.replay(sc) + val sc = mock[SparkContext] + when(sc.executorMemory).thenReturn(100) + when(sc.getSparkHome()).thenReturn(Option("/path")) + when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String]) + when(sc.conf).thenReturn(new SparkConf) + when(sc.listenerBus).thenReturn(listenerBus) val minMem = MemoryUtils.calculateTotalMemory(sc).toInt val minCpu = 4 @@ -121,25 +122,29 @@ class MesosSchedulerBackendSuite extends FunSuite with LocalSparkContext with Ea 2 )) val taskDesc = new TaskDescription(1L, 0, "s1", "n1", 0, ByteBuffer.wrap(new Array[Byte](0))) - EasyMock.expect(taskScheduler.resourceOffers(EasyMock.eq(expectedWorkerOffers))).andReturn(Seq(Seq(taskDesc))) - EasyMock.expect(taskScheduler.CPUS_PER_TASK).andReturn(2).anyTimes() - EasyMock.replay(taskScheduler) + when(taskScheduler.resourceOffers(expectedWorkerOffers)).thenReturn(Seq(Seq(taskDesc))) + when(taskScheduler.CPUS_PER_TASK).thenReturn(2) - val capture = new Capture[util.Collection[TaskInfo]] - EasyMock.expect( + val capture = ArgumentCaptor.forClass(classOf[util.Collection[TaskInfo]]) + when( driver.launchTasks( - EasyMock.eq(Collections.singleton(mesosOffers.get(0).getId)), - EasyMock.capture(capture), - EasyMock.anyObject(classOf[Filters]) + Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)), + capture.capture(), + any(classOf[Filters]) ) - ).andReturn(Status.valueOf(1)).once - EasyMock.expect(driver.declineOffer(mesosOffers.get(1).getId)).andReturn(Status.valueOf(1)).times(1) - EasyMock.expect(driver.declineOffer(mesosOffers.get(2).getId)).andReturn(Status.valueOf(1)).times(1) - EasyMock.replay(driver) + ).thenReturn(Status.valueOf(1)) + when(driver.declineOffer(mesosOffers.get(1).getId)).thenReturn(Status.valueOf(1)) + 
when(driver.declineOffer(mesosOffers.get(2).getId)).thenReturn(Status.valueOf(1)) backend.resourceOffers(driver, mesosOffers) - EasyMock.verify(driver) + verify(driver, times(1)).launchTasks( + Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)), + capture.capture(), + any(classOf[Filters]) + ) + verify(driver, times(1)).declineOffer(mesosOffers.get(1).getId) + verify(driver, times(1)).declineOffer(mesosOffers.get(2).getId) assert(capture.getValue.size() == 1) val taskInfo = capture.getValue.iterator().next() assert(taskInfo.getName.equals("n1")) @@ -151,15 +156,13 @@ class MesosSchedulerBackendSuite extends FunSuite with LocalSparkContext with Ea // Unwanted resources offered on an existing node. Make sure they are declined val mesosOffers2 = new java.util.ArrayList[Offer] mesosOffers2.add(createOffer(1, minMem, minCpu)) - EasyMock.reset(taskScheduler) - EasyMock.reset(driver) - EasyMock.expect(taskScheduler.resourceOffers(EasyMock.anyObject(classOf[Seq[WorkerOffer]])).andReturn(Seq(Seq()))) - EasyMock.expect(taskScheduler.CPUS_PER_TASK).andReturn(2).anyTimes() - EasyMock.replay(taskScheduler) - EasyMock.expect(driver.declineOffer(mesosOffers2.get(0).getId)).andReturn(Status.valueOf(1)).times(1) - EasyMock.replay(driver) + reset(taskScheduler) + reset(driver) + when(taskScheduler.resourceOffers(any(classOf[Seq[WorkerOffer]]))).thenReturn(Seq(Seq())) + when(taskScheduler.CPUS_PER_TASK).thenReturn(2) + when(driver.declineOffer(mesosOffers2.get(0).getId)).thenReturn(Status.valueOf(1)) backend.resourceOffers(driver, mesosOffers2) - EasyMock.verify(driver) + verify(driver, times(1)).declineOffer(mesosOffers2.get(0).getId) } } diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index c815eda52bda7..216661b8bc73a 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -67,11 +67,6 @@ scalacheck_${scala.binary.version} test - - org.easymock - easymockclassextension - test - com.novocode junit-interface diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala index 41dbd64c2b1fa..f56898af029c1 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala @@ -20,7 +20,6 @@ import java.nio.ByteBuffer import scala.collection.JavaConversions.seqAsJavaList -import org.apache.spark.annotation.Experimental import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Milliseconds import org.apache.spark.streaming.Seconds @@ -28,9 +27,11 @@ import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.TestSuiteBase import org.apache.spark.streaming.util.Clock import org.apache.spark.streaming.util.ManualClock + +import org.mockito.Mockito._ import org.scalatest.BeforeAndAfter import org.scalatest.Matchers -import org.scalatest.mock.EasyMockSugar +import org.scalatest.mock.MockitoSugar import com.amazonaws.services.kinesis.clientlibrary.exceptions.InvalidStateException import com.amazonaws.services.kinesis.clientlibrary.exceptions.KinesisClientLibDependencyException @@ -42,10 +43,10 @@ import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason import com.amazonaws.services.kinesis.model.Record /** - * Suite of Kinesis streaming receiver tests focusing mostly on the KinesisRecordProcessor + * Suite of Kinesis 
streaming receiver tests focusing mostly on the KinesisRecordProcessor */ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAfter - with EasyMockSugar { + with MockitoSugar { val app = "TestKinesisReceiver" val stream = "mySparkStream" @@ -73,6 +74,14 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft currentClockMock = mock[Clock] } + override def afterFunction(): Unit = { + super.afterFunction() + // Since this suite was originally written using EasyMock, add this to preserve the old + // mocking semantics (see SPARK-5735 for more details) + verifyNoMoreInteractions(receiverMock, checkpointerMock, checkpointClockMock, + checkpointStateMock, currentClockMock) + } + test("kinesis utils api") { val ssc = new StreamingContext(master, framework, batchDuration) // Tests the API, does not actually test data receiving @@ -83,193 +92,175 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft } test("process records including store and checkpoint") { - val expectedCheckpointIntervalMillis = 10 - expecting { - receiverMock.isStopped().andReturn(false).once() - receiverMock.store(record1.getData().array()).once() - receiverMock.store(record2.getData().array()).once() - checkpointStateMock.shouldCheckpoint().andReturn(true).once() - checkpointerMock.checkpoint().once() - checkpointStateMock.advanceCheckpoint().once() - } - whenExecuting(receiverMock, checkpointerMock, checkpointStateMock) { - val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, - checkpointStateMock) - recordProcessor.processRecords(batch, checkpointerMock) - } + when(receiverMock.isStopped()).thenReturn(false) + when(checkpointStateMock.shouldCheckpoint()).thenReturn(true) + + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.processRecords(batch, checkpointerMock) + + verify(receiverMock, times(1)).isStopped() + verify(receiverMock, times(1)).store(record1.getData().array()) + verify(receiverMock, times(1)).store(record2.getData().array()) + verify(checkpointStateMock, times(1)).shouldCheckpoint() + verify(checkpointerMock, times(1)).checkpoint() + verify(checkpointStateMock, times(1)).advanceCheckpoint() } test("shouldn't store and checkpoint when receiver is stopped") { - expecting { - receiverMock.isStopped().andReturn(true).once() - } - whenExecuting(receiverMock, checkpointerMock, checkpointStateMock) { - val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, - checkpointStateMock) - recordProcessor.processRecords(batch, checkpointerMock) - } + when(receiverMock.isStopped()).thenReturn(true) + + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.processRecords(batch, checkpointerMock) + + verify(receiverMock, times(1)).isStopped() } test("shouldn't checkpoint when exception occurs during store") { - expecting { - receiverMock.isStopped().andReturn(false).once() - receiverMock.store(record1.getData().array()).andThrow(new RuntimeException()).once() - } - whenExecuting(receiverMock, checkpointerMock, checkpointStateMock) { - intercept[RuntimeException] { - val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, - checkpointStateMock) - recordProcessor.processRecords(batch, checkpointerMock) - } + when(receiverMock.isStopped()).thenReturn(false) + when(receiverMock.store(record1.getData().array())).thenThrow(new RuntimeException()) + + intercept[RuntimeException] { + 
val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.processRecords(batch, checkpointerMock) } + + verify(receiverMock, times(1)).isStopped() + verify(receiverMock, times(1)).store(record1.getData().array()) } test("should set checkpoint time to currentTime + checkpoint interval upon instantiation") { - expecting { - currentClockMock.currentTime().andReturn(0).once() - } - whenExecuting(currentClockMock) { + when(currentClockMock.currentTime()).thenReturn(0) + val checkpointIntervalMillis = 10 - val checkpointState = new KinesisCheckpointState(Milliseconds(checkpointIntervalMillis), currentClockMock) + val checkpointState = + new KinesisCheckpointState(Milliseconds(checkpointIntervalMillis), currentClockMock) assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) - } + + verify(currentClockMock, times(1)).currentTime() } test("should checkpoint if we have exceeded the checkpoint interval") { - expecting { - currentClockMock.currentTime().andReturn(0).once() - } - whenExecuting(currentClockMock) { - val checkpointState = new KinesisCheckpointState(Milliseconds(Long.MinValue), currentClockMock) - assert(checkpointState.shouldCheckpoint()) - } + when(currentClockMock.currentTime()).thenReturn(0) + + val checkpointState = new KinesisCheckpointState(Milliseconds(Long.MinValue), currentClockMock) + assert(checkpointState.shouldCheckpoint()) + + verify(currentClockMock, times(1)).currentTime() } test("shouldn't checkpoint if we have not exceeded the checkpoint interval") { - expecting { - currentClockMock.currentTime().andReturn(0).once() - } - whenExecuting(currentClockMock) { - val checkpointState = new KinesisCheckpointState(Milliseconds(Long.MaxValue), currentClockMock) - assert(!checkpointState.shouldCheckpoint()) - } + when(currentClockMock.currentTime()).thenReturn(0) + + val checkpointState = new KinesisCheckpointState(Milliseconds(Long.MaxValue), currentClockMock) + assert(!checkpointState.shouldCheckpoint()) + + verify(currentClockMock, times(1)).currentTime() } test("should add to time when advancing checkpoint") { - expecting { - currentClockMock.currentTime().andReturn(0).once() - } - whenExecuting(currentClockMock) { - val checkpointIntervalMillis = 10 - val checkpointState = new KinesisCheckpointState(Milliseconds(checkpointIntervalMillis), currentClockMock) - assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) - checkpointState.advanceCheckpoint() - assert(checkpointState.checkpointClock.currentTime() == (2 * checkpointIntervalMillis)) - } + when(currentClockMock.currentTime()).thenReturn(0) + + val checkpointIntervalMillis = 10 + val checkpointState = + new KinesisCheckpointState(Milliseconds(checkpointIntervalMillis), currentClockMock) + assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) + checkpointState.advanceCheckpoint() + assert(checkpointState.checkpointClock.currentTime() == (2 * checkpointIntervalMillis)) + + verify(currentClockMock, times(1)).currentTime() } test("shutdown should checkpoint if the reason is TERMINATE") { - expecting { - checkpointerMock.checkpoint().once() - } - whenExecuting(checkpointerMock, checkpointStateMock) { - val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, - checkpointStateMock) - val reason = ShutdownReason.TERMINATE - recordProcessor.shutdown(checkpointerMock, reason) - } + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + val 
reason = ShutdownReason.TERMINATE + recordProcessor.shutdown(checkpointerMock, reason) + + verify(checkpointerMock, times(1)).checkpoint() } test("shutdown should not checkpoint if the reason is something other than TERMINATE") { - expecting { - } - whenExecuting(checkpointerMock, checkpointStateMock) { - val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, - checkpointStateMock) - recordProcessor.shutdown(checkpointerMock, ShutdownReason.ZOMBIE) - recordProcessor.shutdown(checkpointerMock, null) - } + val recordProcessor = new KinesisRecordProcessor(receiverMock, workerId, checkpointStateMock) + recordProcessor.shutdown(checkpointerMock, ShutdownReason.ZOMBIE) + recordProcessor.shutdown(checkpointerMock, null) + + verify(checkpointerMock, never()).checkpoint() } test("retry success on first attempt") { val expectedIsStopped = false - expecting { - receiverMock.isStopped().andReturn(expectedIsStopped).once() - } - whenExecuting(receiverMock) { - val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) - assert(actualVal == expectedIsStopped) - } + when(receiverMock.isStopped()).thenReturn(expectedIsStopped) + + val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) + assert(actualVal == expectedIsStopped) + + verify(receiverMock, times(1)).isStopped() } test("retry success on second attempt after a Kinesis throttling exception") { val expectedIsStopped = false - expecting { - receiverMock.isStopped().andThrow(new ThrottlingException("error message")) - .andReturn(expectedIsStopped).once() - } - whenExecuting(receiverMock) { - val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) - assert(actualVal == expectedIsStopped) - } + when(receiverMock.isStopped()) + .thenThrow(new ThrottlingException("error message")) + .thenReturn(expectedIsStopped) + + val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) + assert(actualVal == expectedIsStopped) + + verify(receiverMock, times(2)).isStopped() } test("retry success on second attempt after a Kinesis dependency exception") { val expectedIsStopped = false - expecting { - receiverMock.isStopped().andThrow(new KinesisClientLibDependencyException("error message")) - .andReturn(expectedIsStopped).once() - } - whenExecuting(receiverMock) { - val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) - assert(actualVal == expectedIsStopped) - } + when(receiverMock.isStopped()) + .thenThrow(new KinesisClientLibDependencyException("error message")) + .thenReturn(expectedIsStopped) + + val actualVal = KinesisRecordProcessor.retryRandom(receiverMock.isStopped(), 2, 100) + assert(actualVal == expectedIsStopped) + + verify(receiverMock, times(2)).isStopped() } test("retry failed after a shutdown exception") { - expecting { - checkpointerMock.checkpoint().andThrow(new ShutdownException("error message")).once() - } - whenExecuting(checkpointerMock) { - intercept[ShutdownException] { - KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) - } + when(checkpointerMock.checkpoint()).thenThrow(new ShutdownException("error message")) + + intercept[ShutdownException] { + KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) } + + verify(checkpointerMock, times(1)).checkpoint() } test("retry failed after an invalid state exception") { - expecting { - checkpointerMock.checkpoint().andThrow(new InvalidStateException("error message")).once() - } - 
whenExecuting(checkpointerMock) { - intercept[InvalidStateException] { - KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) - } + when(checkpointerMock.checkpoint()).thenThrow(new InvalidStateException("error message")) + + intercept[InvalidStateException] { + KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) } + + verify(checkpointerMock, times(1)).checkpoint() } test("retry failed after unexpected exception") { - expecting { - checkpointerMock.checkpoint().andThrow(new RuntimeException("error message")).once() - } - whenExecuting(checkpointerMock) { - intercept[RuntimeException] { - KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) - } + when(checkpointerMock.checkpoint()).thenThrow(new RuntimeException("error message")) + + intercept[RuntimeException] { + KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) } + + verify(checkpointerMock, times(1)).checkpoint() } test("retry failed after exhausing all retries") { val expectedErrorMessage = "final try error message" - expecting { - checkpointerMock.checkpoint().andThrow(new ThrottlingException("error message")) - .andThrow(new ThrottlingException(expectedErrorMessage)).once() - } - whenExecuting(checkpointerMock) { - val exception = intercept[RuntimeException] { - KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) - } - exception.getMessage().shouldBe(expectedErrorMessage) + when(checkpointerMock.checkpoint()) + .thenThrow(new ThrottlingException("error message")) + .thenThrow(new ThrottlingException(expectedErrorMessage)) + + val exception = intercept[RuntimeException] { + KinesisRecordProcessor.retryRandom(checkpointerMock.checkpoint(), 2, 100) } + exception.getMessage().shouldBe(expectedErrorMessage) + + verify(checkpointerMock, times(2)).checkpoint() } } diff --git a/pom.xml b/pom.xml index 56e37d42265c0..53372d5cfc624 100644 --- a/pom.xml +++ b/pom.xml @@ -619,19 +619,6 @@ 2.2.1 test - - org.easymock - easymockclassextension - 3.1 - test - - - - asm - asm - 3.3.1 - test - org.mockito mockito-all From 9f31db061019414a964aac432e946eac61f8307c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emre=20Sevin=C3=A7?= Date: Fri, 13 Feb 2015 12:31:27 -0800 Subject: [PATCH 158/357] SPARK-5805 Fixed the type error in documentation. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes SPARK-5805 : Fix the type error in the final example given in MLlib - Clustering documentation. Author: Emre Sevinç Closes #4596 from emres/SPARK-5805 and squashes the following commits: 1029f66 [Emre Sevinç] SPARK-5805 Fixed the type error in documentation. --- docs/mllib-clustering.md | 62 ++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 99ed6b60e3f00..0450e6fd3e07e 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -14,7 +14,7 @@ Clustering is an unsupervised learning problem whereby we aim to group subsets of entities with one another based on some notion of similarity. Clustering is often used for exploratory analysis and/or as a component of a hierarchical supervised learning pipeline (in which distinct classifiers or regression -models are trained for each cluster). +models are trained for each cluster). MLlib supports the following models: @@ -25,7 +25,7 @@ most commonly used clustering algorithms that clusters the data points into a predefined number of clusters. 
The MLlib implementation includes a parallelized variant of the [k-means++](http://en.wikipedia.org/wiki/K-means%2B%2B) method called [kmeans||](http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf). -The implementation in MLlib has the following parameters: +The implementation in MLlib has the following parameters: * *k* is the number of desired clusters. * *maxIterations* is the maximum number of iterations to run. @@ -35,12 +35,12 @@ initialization via k-means\|\|. guaranteed to find a globally optimal solution, and when run multiple times on a given dataset, the algorithm returns the best clustering result). * *initializationSteps* determines the number of steps in the k-means\|\| algorithm. -* *epsilon* determines the distance threshold within which we consider k-means to have converged. +* *epsilon* determines the distance threshold within which we consider k-means to have converged. ### Gaussian mixture A [Gaussian Mixture Model](http://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) -represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions, +represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions, each with its own probability. The MLlib implementation uses the [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) algorithm to induce the maximum-likelihood model given a set of samples. The implementation @@ -221,8 +221,8 @@ print("Within Set Sum of Squared Error = " + str(WSSSE))
    In the following example after loading and parsing data, we use a -[GaussianMixture](api/scala/index.html#org.apache.spark.mllib.clustering.GaussianMixture) -object to cluster the data into two clusters. The number of desired clusters is passed +[GaussianMixture](api/scala/index.html#org.apache.spark.mllib.clustering.GaussianMixture) +object to cluster the data into two clusters. The number of desired clusters is passed to the algorithm. We then output the parameters of the mixture model. {% highlight scala %} @@ -238,7 +238,7 @@ val gmm = new GaussianMixture().setK(2).run(parsedData) // output parameters of max-likelihood model for (i <- 0 until gmm.k) { - println("weight=%f\nmu=%s\nsigma=\n%s\n" format + println("weight=%f\nmu=%s\nsigma=\n%s\n" format (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma)) } @@ -298,7 +298,7 @@ public class GaussianMixtureExample {
    In the following example after loading and parsing data, we use a [GaussianMixture](api/python/pyspark.mllib.html#pyspark.mllib.clustering.GaussianMixture) -object to cluster the data into two clusters. The number of desired clusters is passed +object to cluster the data into two clusters. The number of desired clusters is passed to the algorithm. We then output the parameters of the mixture model. {% highlight python %} @@ -326,7 +326,7 @@ for i in range(2): In the following example, we load word count vectors representing a corpus of documents. We then use [LDA](api/scala/index.html#org.apache.spark.mllib.clustering.LDA) -to infer three topics from the documents. The number of desired clusters is passed +to infer three topics from the documents. The number of desired clusters is passed to the algorithm. We then output the topics, represented as probability distributions over words.
    @@ -428,27 +428,27 @@ a dependency. ## Streaming clustering -When data arrive in a stream, we may want to estimate clusters dynamically, -updating them as new data arrive. MLlib provides support for streaming k-means clustering, -with parameters to control the decay (or "forgetfulness") of the estimates. The algorithm -uses a generalization of the mini-batch k-means update rule. For each batch of data, we assign +When data arrive in a stream, we may want to estimate clusters dynamically, +updating them as new data arrive. MLlib provides support for streaming k-means clustering, +with parameters to control the decay (or "forgetfulness") of the estimates. The algorithm +uses a generalization of the mini-batch k-means update rule. For each batch of data, we assign all points to their nearest cluster, compute new cluster centers, then update each cluster using: `\begin{equation} c_{t+1} = \frac{c_tn_t\alpha + x_tm_t}{n_t\alpha+m_t} \end{equation}` `\begin{equation} - n_{t+1} = n_t + m_t + n_{t+1} = n_t + m_t \end{equation}` -Where `$c_t$` is the previous center for the cluster, `$n_t$` is the number of points assigned -to the cluster thus far, `$x_t$` is the new cluster center from the current batch, and `$m_t$` -is the number of points added to the cluster in the current batch. The decay factor `$\alpha$` -can be used to ignore the past: with `$\alpha$=1` all data will be used from the beginning; -with `$\alpha$=0` only the most recent data will be used. This is analogous to an -exponentially-weighted moving average. +Where `$c_t$` is the previous center for the cluster, `$n_t$` is the number of points assigned +to the cluster thus far, `$x_t$` is the new cluster center from the current batch, and `$m_t$` +is the number of points added to the cluster in the current batch. The decay factor `$\alpha$` +can be used to ignore the past: with `$\alpha$=1` all data will be used from the beginning; +with `$\alpha$=0` only the most recent data will be used. This is analogous to an +exponentially-weighted moving average. -The decay can be specified using a `halfLife` parameter, which determines the +The decay can be specified using a `halfLife` parameter, which determines the correct decay factor `a` such that, for data acquired at time `t`, its contribution by time `t + halfLife` will have dropped to 0.5. The unit of time can be specified either as `batches` or `points` and the update rule @@ -472,9 +472,9 @@ import org.apache.spark.mllib.clustering.StreamingKMeans {% endhighlight %} -Then we make an input stream of vectors for training, as well as a stream of labeled data -points for testing. We assume a StreamingContext `ssc` has been created, see -[Spark Streaming Programming Guide](streaming-programming-guide.html#initializing) for more info. +Then we make an input stream of vectors for training, as well as a stream of labeled data +points for testing. We assume a StreamingContext `ssc` has been created, see +[Spark Streaming Programming Guide](streaming-programming-guide.html#initializing) for more info. {% highlight scala %} @@ -496,24 +496,24 @@ val model = new StreamingKMeans() {% endhighlight %} -Now register the streams for training and testing and start the job, printing +Now register the streams for training and testing and start the job, printing the predicted cluster assignments on new data points as they arrive. 
{% highlight scala %} model.trainOn(trainingData) -model.predictOnValues(testData).print() +model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() - + {% endhighlight %} -As you add new text files with data the cluster centers will update. Each training +As you add new text files with data the cluster centers will update. Each training point should be formatted as `[x1, x2, x3]`, and each test data point -should be formatted as `(y, [x1, x2, x3])`, where `y` is some useful label or identifier -(e.g. a true category assignment). Anytime a text file is placed in `/training/data/dir` -the model will update. Anytime a text file is placed in `/testing/data/dir` +should be formatted as `(y, [x1, x2, x3])`, where `y` is some useful label or identifier +(e.g. a true category assignment). Anytime a text file is placed in `/training/data/dir` +the model will update. Anytime a text file is placed in `/testing/data/dir` you will see predictions. With new data, the cluster centers will change!
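The one-line change above is what resolves the type error: `StreamingKMeans.predictOnValues` operates on a stream of key/value pairs (roughly a `DStream[(K, Vector)]`), while `testData` in this example is a stream of `LabeledPoint`s, so the label has to be kept as the key and the features passed as the value. A minimal sketch of the corrected call, reusing the example's own `model`, `trainingData`, and `testData` names (sketch only, not part of the patch):

    // trainingData: DStream[Vector]; testData: DStream[LabeledPoint]
    model.trainOn(trainingData)
    // map to (key, Vector) pairs so the types line up with predictOnValues
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()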
    From 378c7eb0d6a9a86da5b9d75a41400128abbe67fb Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 13 Feb 2015 12:43:53 -0800 Subject: [PATCH 159/357] [HOTFIX] Ignore DirectKafkaStreamSuite. --- .../spark/streaming/kafka/DirectKafkaStreamSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala index b25c2120d54f7..926094449e7fc 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala @@ -67,7 +67,7 @@ class DirectKafkaStreamSuite extends KafkaStreamSuiteBase } - test("basic stream receiving with multiple topics and smallest starting offset") { + ignore("basic stream receiving with multiple topics and smallest starting offset") { val topics = Set("basic1", "basic2", "basic3") val data = Map("a" -> 7, "b" -> 9) topics.foreach { t => @@ -113,7 +113,7 @@ class DirectKafkaStreamSuite extends KafkaStreamSuiteBase ssc.stop() } - test("receiving from largest starting offset") { + ignore("receiving from largest starting offset") { val topic = "largest" val topicPartition = TopicAndPartition(topic, 0) val data = Map("a" -> 10) @@ -158,7 +158,7 @@ class DirectKafkaStreamSuite extends KafkaStreamSuiteBase } - test("creating stream by offset") { + ignore("creating stream by offset") { val topic = "offset" val topicPartition = TopicAndPartition(topic, 0) val data = Map("a" -> 10) @@ -204,7 +204,7 @@ class DirectKafkaStreamSuite extends KafkaStreamSuiteBase } // Test to verify the offset ranges can be recovered from the checkpoints - test("offset recovery") { + ignore("offset recovery") { val topic = "recovery" createTopic(topic) testDir = Utils.createTempDir() From 5d3cc6b3d7e6ede3be68885e37e3c4ca3a608982 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Fri, 13 Feb 2015 13:10:29 -0800 Subject: [PATCH 160/357] [HOTFIX] Fix build break in MesosSchedulerBackendSuite --- .../spark/scheduler/mesos/MesosSchedulerBackendSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala index 8cd302e2b4313..afbaa9ade811f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosSchedulerBackendSuite.scala @@ -35,7 +35,8 @@ import org.scalatest.mock.MockitoSugar import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext} import org.apache.spark.executor.MesosExecutorBackend -import org.apache.spark.scheduler._ +import org.apache.spark.scheduler.{LiveListenerBus, SparkListenerExecutorAdded, + TaskDescription, TaskSchedulerImpl, WorkerOffer} import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.scheduler.cluster.mesos.{MesosSchedulerBackend, MemoryUtils} From 2cbb3e433ae334d5c318f05b987af314c854fbcc Mon Sep 17 00:00:00 2001 From: Daoyuan Wang Date: Fri, 13 Feb 2015 13:46:50 -0800 Subject: [PATCH 161/357] [SPARK-5642] [SQL] Apply column pruning on unused aggregation fields select k from (select key k, max(value) v from src group by k) t Author: Daoyuan Wang Author: Michael Armbrust Closes #4415 from adrian-wang/groupprune and squashes the 
following commits: 5d2d8a3 [Daoyuan Wang] address Michael's comments 61f8ef7 [Daoyuan Wang] add a unit test 80ddcc6 [Daoyuan Wang] keep project b69d385 [Daoyuan Wang] add a prune rule for grouping set --- .../sql/catalyst/optimizer/Optimizer.scala | 9 +++++ .../optimizer/FilterPushdownSuite.scala | 37 ++++++++++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 0da081ed1a6e2..1a75fcf3545bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -119,6 +119,15 @@ object ColumnPruning extends Rule[LogicalPlan] { case a @ Aggregate(_, _, child) if (child.outputSet -- a.references).nonEmpty => a.copy(child = Project(a.references.toSeq, child)) + case p @ Project(projectList, a @ Aggregate(groupingExpressions, aggregateExpressions, child)) + if (a.outputSet -- p.references).nonEmpty => + Project( + projectList, + Aggregate( + groupingExpressions, + aggregateExpressions.filter(e => p.references.contains(e)), + child)) + // Eliminate unneeded attributes from either side of a Join. case Project(projectList, Join(left, right, joinType, condition)) => // Collect the list of all references required either above or to evaluate the condition. diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index 0b74bacb18f4b..55c6766520a1e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries -import org.apache.spark.sql.catalyst.expressions.Explode +import org.apache.spark.sql.catalyst.expressions.{Count, Explode} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.{PlanTest, LeftOuter, RightOuter} import org.apache.spark.sql.catalyst.rules._ @@ -37,7 +37,8 @@ class FilterPushdownSuite extends PlanTest { CombineFilters, PushPredicateThroughProject, PushPredicateThroughJoin, - PushPredicateThroughGenerate) :: Nil + PushPredicateThroughGenerate, + ColumnPruning) :: Nil } val testRelation = LocalRelation('a.int, 'b.int, 'c.int) @@ -58,6 +59,38 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("column pruning for group") { + val originalQuery = + testRelation + .groupBy('a)('a, Count('b)) + .select('a) + + val optimized = Optimize(originalQuery.analyze) + val correctAnswer = + testRelation + .select('a) + .groupBy('a)('a) + .select('a).analyze + + comparePlans(optimized, correctAnswer) + } + + test("column pruning for group with alias") { + val originalQuery = + testRelation + .groupBy('a)('a as 'c, Count('b)) + .select('c) + + val optimized = Optimize(originalQuery.analyze) + val correctAnswer = + testRelation + .select('a) + .groupBy('a)('a as 'c) + .select('c).analyze + + comparePlans(optimized, correctAnswer) + } + // After this line is unimplemented. 
test("simple push down") { val originalQuery = From 2e0c084528409e1c565e6945521a33c0835ebbee Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Fri, 13 Feb 2015 13:51:06 -0800 Subject: [PATCH 162/357] [SPARK-5789][SQL]Throw a better error message if JsonRDD.parseJson encounters unrecoverable parsing errors. Author: Yin Huai Closes #4582 from yhuai/jsonErrorMessage and squashes the following commits: 152dbd4 [Yin Huai] Update error message. 1466256 [Yin Huai] Throw a better error message when a JSON object in the input dataset span multiple records (lines for files or strings for an RDD of strings). --- .../src/main/scala/org/apache/spark/sql/json/JsonRDD.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index 7dfb30402136e..3b8dde1823370 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -306,6 +306,10 @@ private[sql] object JsonRDD extends Logging { val parsed = mapper.readValue(record, classOf[Object]) match { case map: java.util.Map[_, _] => scalafy(map).asInstanceOf[Map[String, Any]] :: Nil case list: java.util.List[_] => scalafy(list).asInstanceOf[Seq[Map[String, Any]]] + case _ => + sys.error( + s"Failed to parse record $record. Please make sure that each line of the file " + + "(or each string in the RDD) is a valid JSON object or an array of JSON objects.") } parsed From cc56c8729a76af85aa6eb5d2f99787cca5e5b38f Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 13 Feb 2015 15:09:27 -0800 Subject: [PATCH 163/357] [SPARK-5806] re-organize sections in mllib-clustering.md Put example code close to the algorithm description. Author: Xiangrui Meng Closes #4598 from mengxr/SPARK-5806 and squashes the following commits: a137872 [Xiangrui Meng] re-organize sections in mllib-clustering.md --- docs/mllib-clustering.md | 156 ++++++++++++++++++--------------------- docs/mllib-guide.md | 8 +- 2 files changed, 77 insertions(+), 87 deletions(-) diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 0450e6fd3e07e..09b56576699e0 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -4,12 +4,6 @@ title: Clustering - MLlib displayTitle: MLlib - Clustering --- -* Table of contents -{:toc} - - -## Clustering - Clustering is an unsupervised learning problem whereby we aim to group subsets of entities with one another based on some notion of similarity. Clustering is often used for exploratory analysis and/or as a component of a hierarchical @@ -18,7 +12,10 @@ models are trained for each cluster). MLlib supports the following models: -### k-means +* Table of contents +{:toc} + +## K-means [k-means](http://en.wikipedia.org/wiki/K-means_clustering) is one of the most commonly used clustering algorithms that clusters the data points into a @@ -37,72 +34,7 @@ a given dataset, the algorithm returns the best clustering result). * *initializationSteps* determines the number of steps in the k-means\|\| algorithm. * *epsilon* determines the distance threshold within which we consider k-means to have converged. -### Gaussian mixture - -A [Gaussian Mixture Model](http://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) -represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions, -each with its own probability. 
The MLlib implementation uses the -[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) - algorithm to induce the maximum-likelihood model given a set of samples. The implementation -has the following parameters: - -* *k* is the number of desired clusters. -* *convergenceTol* is the maximum change in log-likelihood at which we consider convergence achieved. -* *maxIterations* is the maximum number of iterations to perform without reaching convergence. -* *initialModel* is an optional starting point from which to start the EM algorithm. If this parameter is omitted, a random starting point will be constructed from the data. - -### Power Iteration Clustering - -Power iteration clustering is a scalable and efficient algorithm for clustering points given pointwise mutual affinity values. Internally the algorithm: - -* accepts a [Graph](api/graphx/index.html#org.apache.spark.graphx.Graph) that represents a normalized pairwise affinity between all input points. -* calculates the principal eigenvalue and eigenvector -* Clusters each of the input points according to their principal eigenvector component value - -Details of this algorithm are found within [Power Iteration Clustering, Lin and Cohen]{www.icml2010.org/papers/387.pdf} - -Example outputs for a dataset inspired by the paper - but with five clusters instead of three- have he following output from our implementation: - -

- [figure: "The Property Graph"] -

    - -### Latent Dirichlet Allocation (LDA) - -[Latent Dirichlet Allocation (LDA)](http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) -is a topic model which infers topics from a collection of text documents. -LDA can be thought of as a clustering algorithm as follows: - -* Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. -* Topics and documents both exist in a feature space, where feature vectors are vectors of word counts. -* Rather than estimating a clustering using a traditional distance, LDA uses a function based - on a statistical model of how text documents are generated. - -LDA takes in a collection of documents as vectors of word counts. -It learns clustering using [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) -on the likelihood function. After fitting on the documents, LDA provides: - -* Topics: Inferred topics, each of which is a probability distribution over terms (words). -* Topic distributions for documents: For each document in the training set, LDA gives a probability distribution over topics. - -LDA takes the following parameters: - -* `k`: Number of topics (i.e., cluster centers) -* `maxIterations`: Limit on the number of iterations of EM used for learning -* `docConcentration`: Hyperparameter for prior over documents' distributions over topics. Currently must be > 1, where larger values encourage smoother inferred distributions. -* `topicConcentration`: Hyperparameter for prior over topics' distributions over terms (words). Currently must be > 1, where larger values encourage smoother inferred distributions. -* `checkpointInterval`: If using checkpointing (set in the Spark configuration), this parameter specifies the frequency with which checkpoints will be created. If `maxIterations` is large, using checkpointing can help reduce shuffle file sizes on disk and help with failure recovery. - -*Note*: LDA is a new feature with some missing functionality. In particular, it does not yet -support prediction on new documents, and it does not have a Python API. These will be added in the future. - -### Examples - -#### k-means +**Examples**
    @@ -216,7 +148,21 @@ print("Within Set Sum of Squared Error = " + str(WSSSE))
    -#### GaussianMixture +## Gaussian mixture + +A [Gaussian Mixture Model](http://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) +represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions, +each with its own probability. The MLlib implementation uses the +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) + algorithm to induce the maximum-likelihood model given a set of samples. The implementation +has the following parameters: + +* *k* is the number of desired clusters. +* *convergenceTol* is the maximum change in log-likelihood at which we consider convergence achieved. +* *maxIterations* is the maximum number of iterations to perform without reaching convergence. +* *initialModel* is an optional starting point from which to start the EM algorithm. If this parameter is omitted, a random starting point will be constructed from the data. + +**Examples**
    @@ -322,7 +268,56 @@ for i in range(2):
    -#### Latent Dirichlet Allocation (LDA) Example +## Power iteration clustering (PIC) + +Power iteration clustering (PIC) is a scalable and efficient algorithm for clustering points given pointwise mutual affinity values. Internally the algorithm: + +* accepts a [Graph](api/graphx/index.html#org.apache.spark.graphx.Graph) that represents a normalized pairwise affinity between all input points. +* calculates the principal eigenvalue and eigenvector +* Clusters each of the input points according to their principal eigenvector component value + +Details of this algorithm are found within [Power Iteration Clustering, Lin and Cohen]{www.icml2010.org/papers/387.pdf} + +Example outputs for a dataset inspired by the paper - but with five clusters instead of three- have he following output from our implementation: + +

+ [figure: "The Property Graph"] + +

    + +## Latent Dirichlet allocation (LDA) + +[Latent Dirichlet allocation (LDA)](http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) +is a topic model which infers topics from a collection of text documents. +LDA can be thought of as a clustering algorithm as follows: + +* Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. +* Topics and documents both exist in a feature space, where feature vectors are vectors of word counts. +* Rather than estimating a clustering using a traditional distance, LDA uses a function based + on a statistical model of how text documents are generated. + +LDA takes in a collection of documents as vectors of word counts. +It learns clustering using [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +on the likelihood function. After fitting on the documents, LDA provides: + +* Topics: Inferred topics, each of which is a probability distribution over terms (words). +* Topic distributions for documents: For each document in the training set, LDA gives a probability distribution over topics. + +LDA takes the following parameters: + +* `k`: Number of topics (i.e., cluster centers) +* `maxIterations`: Limit on the number of iterations of EM used for learning +* `docConcentration`: Hyperparameter for prior over documents' distributions over topics. Currently must be > 1, where larger values encourage smoother inferred distributions. +* `topicConcentration`: Hyperparameter for prior over topics' distributions over terms (words). Currently must be > 1, where larger values encourage smoother inferred distributions. +* `checkpointInterval`: If using checkpointing (set in the Spark configuration), this parameter specifies the frequency with which checkpoints will be created. If `maxIterations` is large, using checkpointing can help reduce shuffle file sizes on disk and help with failure recovery. + +*Note*: LDA is a new feature with some missing functionality. In particular, it does not yet +support prediction on new documents, and it does not have a Python API. These will be added in the future. + +**Examples** In the following example, we load word count vectors representing a corpus of documents. We then use [LDA](api/scala/index.html#org.apache.spark.mllib.clustering.LDA) @@ -419,14 +414,7 @@ public class JavaLDAExample {
    - -In order to run the above application, follow the instructions -provided in the [Self-Contained Applications](quick-start.html#self-contained-applications) -section of the Spark -Quick Start guide. Be sure to also include *spark-mllib* to your build file as -a dependency. - -## Streaming clustering +## Streaming k-means When data arrive in a stream, we may want to estimate clusters dynamically, updating them as new data arrive. MLlib provides support for streaming k-means clustering, @@ -454,7 +442,7 @@ at time `t`, its contribution by time `t + halfLife` will have dropped to 0.5. The unit of time can be specified either as `batches` or `points` and the update rule will be adjusted accordingly. -### Examples +**Examples** This example shows how to estimate clusters on streaming data. diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 3d32d03e35c62..5091dbf329b97 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -24,9 +24,11 @@ filtering, dimensionality reduction, as well as underlying optimization primitiv * [Collaborative filtering](mllib-collaborative-filtering.html) * alternating least squares (ALS) * [Clustering](mllib-clustering.html) - * k-means - * Gaussian mixture - * power iteration + * [k-means](mllib-clustering.html#k-means) + * [Gaussian mixture](mllib-clustering.html#gaussian-mixture) + * [power iteration clustering (PIC)](mllib-clustering.html#power-iteration-clustering-pic) + * [latent Dirichlet allocation (LDA)](mllib-clustering.html#latent-dirichlet-allocation-lda) + * [streaming k-means](mllib-clustering.html#streaming-k-means) * [Dimensionality reduction](mllib-dimensionality-reduction.html) * singular value decomposition (SVD) * principal component analysis (PCA) From d50a91d529b0913364b483c511397d4af308a435 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 13 Feb 2015 16:43:49 -0800 Subject: [PATCH 164/357] [SPARK-5803][MLLIB] use ArrayBuilder to build primitive arrays because ArrayBuffer is not specialized. 
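The motivation is a Scala collections detail: `ArrayBuffer[T]` is a generic, non-specialized class, so appending `Double`s to an `ArrayBuffer[Double]` boxes every element, whereas `ArrayBuilder.make[Double]` accumulates directly into a primitive `Array[Double]`. A minimal sketch of the pattern this patch applies (the helper name and filter below are made up for illustration; the real call sites are in the diff):

    import scala.collection.mutable.ArrayBuilder

    // Collect the positive values into a primitive Array[Double] without boxing.
    def positives(values: Array[Double]): Array[Double] = {
      val builder = ArrayBuilder.make[Double]   // builder backed by a primitive array
      values.foreach { v =>
        if (v > 0.0) builder += v               // += appends an unboxed double
      }
      builder.result()                          // returns Array[Double]
    }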
Author: Xiangrui Meng Closes #4594 from mengxr/SPARK-5803 and squashes the following commits: 1261bd5 [Xiangrui Meng] merge master a4ea872 [Xiangrui Meng] use ArrayBuilder to build primitive arrays --- .../org/apache/spark/mllib/feature/Word2Vec.scala | 6 +++--- .../org/apache/spark/mllib/tree/DecisionTree.scala | 10 +++++----- .../org/apache/spark/mllib/util/NumericParser.scala | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index a3e40200bc063..59a79e5c6a4ac 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -21,7 +21,7 @@ import java.lang.{Iterable => JavaIterable} import scala.collection.JavaConverters._ import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.ArrayBuilder import com.github.fommil.netlib.BLAS.{getInstance => blas} @@ -272,7 +272,7 @@ class Word2Vec extends Serializable with Logging { def hasNext: Boolean = iter.hasNext def next(): Array[Int] = { - var sentence = new ArrayBuffer[Int] + val sentence = ArrayBuilder.make[Int] var sentenceLength = 0 while (iter.hasNext && sentenceLength < MAX_SENTENCE_LENGTH) { val word = bcVocabHash.value.get(iter.next()) @@ -283,7 +283,7 @@ class Word2Vec extends Serializable with Logging { case None => } } - sentence.toArray + sentence.result() } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 9a586b9d9c68e..f1f85994e61b2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -17,9 +17,9 @@ package org.apache.spark.mllib.tree -import scala.collection.mutable import scala.collection.JavaConverters._ -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable +import scala.collection.mutable.ArrayBuilder import org.apache.spark.Logging import org.apache.spark.annotation.Experimental @@ -1136,7 +1136,7 @@ object DecisionTree extends Serializable with Logging { logDebug("stride = " + stride) // iterate `valueCount` to find splits - val splits = new ArrayBuffer[Double] + val splitsBuilder = ArrayBuilder.make[Double] var index = 1 // currentCount: sum of counts of values that have been visited var currentCount = valueCounts(0)._2 @@ -1154,13 +1154,13 @@ object DecisionTree extends Serializable with Logging { // makes the gap between currentCount and targetCount smaller, // previous value is a split threshold. 
if (previousGap < currentGap) { - splits.append(valueCounts(index - 1)._1) + splitsBuilder += valueCounts(index - 1)._1 targetCount += stride } index += 1 } - splits.toArray + splitsBuilder.result() } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala index f7cba6c6cb628..308f7f3578e21 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/NumericParser.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.util import java.util.StringTokenizer -import scala.collection.mutable.{ArrayBuffer, ListBuffer} +import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException @@ -51,7 +51,7 @@ private[mllib] object NumericParser { } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { - val values = ArrayBuffer.empty[Double] + val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null @@ -67,14 +67,14 @@ private[mllib] object NumericParser { } } else { // expecting a number - values.append(parseDouble(token)) + values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } - values.toArray + values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { @@ -114,7 +114,7 @@ private[mllib] object NumericParser { try { java.lang.Double.parseDouble(s) } catch { - case e: Throwable => + case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } From 4f4c6d5a5db04a56906bacdc85d7e5589b6edada Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 13 Feb 2015 16:45:59 -0800 Subject: [PATCH 165/357] [SPARK-5730][ML] add doc groups to spark.ml components This PR adds three groups to the ScalaDoc: `param`, `setParam`, and `getParam`. Params will show up in the generated Scala API doc as the top group. Setters/getters will be at the bottom. 
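The grouping relies on standard Scaladoc tags: `@groupname`/`@groupprio` declare the groups (this PR adds them to the `org.apache.spark.ml` package documentation in package.scala), and each member opts in with `@group`. A stripped-down sketch of the scheme, using a made-up class so it stays self-contained (the real tags are in the diff below):

    /**
     * @groupname param Parameters
     * @groupprio param -5
     * @groupname setParam Parameter setters
     * @groupprio setParam 5
     */
    abstract class ExampleEstimator {
      /**
       * param for regularization parameter
       * @group param
       */
      val regParam: Double

      /** @group setParam */
      def setRegParam(value: Double): this.type
    }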
Preview: ![screen shot 2015-02-13 at 2 47 49 pm](https://cloud.githubusercontent.com/assets/829644/6196657/5740c240-b38f-11e4-94bb-bd8ef5a796c5.png) Author: Xiangrui Meng Closes #4600 from mengxr/SPARK-5730 and squashes the following commits: febed9a [Xiangrui Meng] add doc groups to spark.ml components --- .../org/apache/spark/ml/Transformer.scala | 3 + .../spark/ml/classification/Classifier.scala | 2 + .../classification/LogisticRegression.scala | 6 ++ .../ProbabilisticClassifier.scala | 2 + .../BinaryClassificationEvaluator.scala | 12 ++- .../apache/spark/ml/feature/HashingTF.scala | 11 ++- .../spark/ml/feature/StandardScaler.scala | 6 ++ .../spark/ml/impl/estimator/Predictor.scala | 7 ++ .../scala/org/apache/spark/ml/package.scala | 14 +++ .../apache/spark/ml/param/sharedParams.scala | 70 ++++++++++++--- .../apache/spark/ml/recommendation/ALS.scala | 90 +++++++++++++++++-- .../ml/regression/LinearRegression.scala | 3 + .../spark/ml/tuning/CrossValidator.scala | 35 +++++++- 13 files changed, 235 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala index cd95c16aa768d..2ec2ccdb8c8b3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala @@ -62,7 +62,10 @@ abstract class Transformer extends PipelineStage with Params { private[ml] abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, OUT, T]] extends Transformer with HasInputCol with HasOutputCol with Logging { + /** @group setParam */ def setInputCol(value: String): T = set(inputCol, value).asInstanceOf[T] + + /** @group setParam */ def setOutputCol(value: String): T = set(outputCol, value).asInstanceOf[T] /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala index 1bf8eb4640d11..124ab30f279c1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala @@ -66,6 +66,7 @@ private[spark] abstract class Classifier[ extends Predictor[FeaturesType, E, M] with ClassifierParams { + /** @group setParam */ def setRawPredictionCol(value: String): E = set(rawPredictionCol, value).asInstanceOf[E] @@ -87,6 +88,7 @@ private[spark] abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[FeaturesType, M]] extends PredictionModel[FeaturesType, M] with ClassifierParams { + /** @group setParam */ def setRawPredictionCol(value: String): M = set(rawPredictionCol, value).asInstanceOf[M] /** Number of classes (values which the label can take). 
*/ diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index c146fe244c66e..a9a5af5f0f2d7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -49,8 +49,13 @@ class LogisticRegression setMaxIter(100) setThreshold(0.5) + /** @group setParam */ def setRegParam(value: Double): this.type = set(regParam, value) + + /** @group setParam */ def setMaxIter(value: Int): this.type = set(maxIter, value) + + /** @group setParam */ def setThreshold(value: Double): this.type = set(threshold, value) override protected def train(dataset: DataFrame, paramMap: ParamMap): LogisticRegressionModel = { @@ -93,6 +98,7 @@ class LogisticRegressionModel private[ml] ( setThreshold(0.5) + /** @group setParam */ def setThreshold(value: Double): this.type = set(threshold, value) private val margin: Vector => Double = (features) => { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala index 1202528ca654e..38518785dceae 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -61,6 +61,7 @@ private[spark] abstract class ProbabilisticClassifier[ M <: ProbabilisticClassificationModel[FeaturesType, M]] extends Classifier[FeaturesType, E, M] with ProbabilisticClassifierParams { + /** @group setParam */ def setProbabilityCol(value: String): E = set(probabilityCol, value).asInstanceOf[E] } @@ -82,6 +83,7 @@ private[spark] abstract class ProbabilisticClassificationModel[ M <: ProbabilisticClassificationModel[FeaturesType, M]] extends ClassificationModel[FeaturesType, M] with ProbabilisticClassifierParams { + /** @group setParam */ def setProbabilityCol(value: String): M = set(probabilityCol, value).asInstanceOf[M] /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index f21a30627e540..2360f4479f1c2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -35,13 +35,23 @@ import org.apache.spark.sql.types.DoubleType class BinaryClassificationEvaluator extends Evaluator with Params with HasRawPredictionCol with HasLabelCol { - /** param for metric name in evaluation */ + /** + * param for metric name in evaluation + * @group param + */ val metricName: Param[String] = new Param(this, "metricName", "metric name in evaluation (areaUnderROC|areaUnderPR)", Some("areaUnderROC")) + + /** @group getParam */ def getMetricName: String = get(metricName) + + /** @group setParam */ def setMetricName(value: String): this.type = set(metricName, value) + /** @group setParam */ def setScoreCol(value: String): this.type = set(rawPredictionCol, value) + + /** @group setParam */ def setLabelCol(value: String): this.type = set(labelCol, value) override def evaluate(dataset: DataFrame, paramMap: ParamMap): Double = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index 0956062643f23..6131ba8832691 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -31,11 +31,18 @@ import org.apache.spark.sql.types.DataType @AlphaComponent class HashingTF extends UnaryTransformer[Iterable[_], Vector, HashingTF] { - /** number of features */ + /** + * number of features + * @group param + */ val numFeatures = new IntParam(this, "numFeatures", "number of features", Some(1 << 18)) - def setNumFeatures(value: Int) = set(numFeatures, value) + + /** @group getParam */ def getNumFeatures: Int = get(numFeatures) + /** @group setParam */ + def setNumFeatures(value: Int) = set(numFeatures, value) + override protected def createTransformFunc(paramMap: ParamMap): Iterable[_] => Vector = { val hashingTF = new feature.HashingTF(paramMap(numFeatures)) hashingTF.transform diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 4745a7ae95679..7623ec59ae0eb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -39,7 +39,10 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with @AlphaComponent class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerParams { + /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) override def fit(dataset: DataFrame, paramMap: ParamMap): StandardScalerModel = { @@ -75,7 +78,10 @@ class StandardScalerModel private[ml] ( scaler: feature.StandardScalerModel) extends Model[StandardScalerModel] with StandardScalerParams { + /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala index 89b53f3890ea3..e416c1eb585fc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala @@ -85,8 +85,13 @@ private[spark] abstract class Predictor[ M <: PredictionModel[FeaturesType, M]] extends Estimator[M] with PredictorParams { + /** @group setParam */ def setLabelCol(value: String): Learner = set(labelCol, value).asInstanceOf[Learner] + + /** @group setParam */ def setFeaturesCol(value: String): Learner = set(featuresCol, value).asInstanceOf[Learner] + + /** @group setParam */ def setPredictionCol(value: String): Learner = set(predictionCol, value).asInstanceOf[Learner] override def fit(dataset: DataFrame, paramMap: ParamMap): M = { @@ -160,8 +165,10 @@ private[spark] abstract class Predictor[ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, M]] extends Model[M] with PredictorParams { + /** @group setParam */ def setFeaturesCol(value: String): M = set(featuresCol, value).asInstanceOf[M] + /** @group setParam */ def setPredictionCol(value: String): M = set(predictionCol, value).asInstanceOf[M] /** diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/package.scala b/mllib/src/main/scala/org/apache/spark/ml/package.scala index 51cd48c90432a..b45bd1499b72e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/package.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/package.scala @@ -20,5 +20,19 @@ package org.apache.spark /** * Spark ML is an ALPHA component that adds a new set of machine learning APIs to let users quickly * assemble and configure practical machine learning pipelines. + * + * @groupname param Parameters + * @groupdesc param A list of (hyper-)parameter keys this algorithm can take. Users can set and get + * the parameter values through setters and getters, respectively. + * @groupprio param -5 + * + * @groupname setParam Parameter setters + * @groupprio setParam 5 + * + * @groupname getParam Parameter getters + * @groupprio getParam 6 + * + * @groupname Ungrouped Members + * @groupprio Ungrouped 0 */ package object ml diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/sharedParams.scala index 32fc74462ef4a..1a70322b4cace 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/sharedParams.scala @@ -24,67 +24,117 @@ package org.apache.spark.ml.param */ private[ml] trait HasRegParam extends Params { - /** param for regularization parameter */ + /** + * param for regularization parameter + * @group param + */ val regParam: DoubleParam = new DoubleParam(this, "regParam", "regularization parameter") + + /** @group getParam */ def getRegParam: Double = get(regParam) } private[ml] trait HasMaxIter extends Params { - /** param for max number of iterations */ + /** + * param for max number of iterations + * @group param + */ val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations") + + /** @group getParam */ def getMaxIter: Int = get(maxIter) } private[ml] trait HasFeaturesCol extends Params { - /** param for features column name */ + /** + * param for features column name + * @group param + */ val featuresCol: Param[String] = new Param(this, "featuresCol", "features column name", Some("features")) + + /** @group getParam */ def getFeaturesCol: String = get(featuresCol) } private[ml] trait HasLabelCol extends Params { - /** param for label column name */ + /** + * param for label column name + * @group param + */ val labelCol: Param[String] = new Param(this, "labelCol", "label column name", Some("label")) + + /** @group getParam */ def getLabelCol: String = get(labelCol) } private[ml] trait HasPredictionCol extends Params { - /** param for prediction column name */ + /** + * param for prediction column name + * @group param + */ val predictionCol: Param[String] = new Param(this, "predictionCol", "prediction column name", Some("prediction")) + + /** @group getParam */ def getPredictionCol: String = get(predictionCol) } private[ml] trait HasRawPredictionCol extends Params { - /** param for raw prediction column name */ + /** + * param for raw prediction column name + * @group param + */ val rawPredictionCol: Param[String] = new Param(this, "rawPredictionCol", "raw prediction (a.k.a. 
confidence) column name", Some("rawPrediction")) + + /** @group getParam */ def getRawPredictionCol: String = get(rawPredictionCol) } private[ml] trait HasProbabilityCol extends Params { - /** param for predicted class conditional probabilities column name */ + /** + * param for predicted class conditional probabilities column name + * @group param + */ val probabilityCol: Param[String] = new Param(this, "probabilityCol", "column name for predicted class conditional probabilities", Some("probability")) + + /** @group getParam */ def getProbabilityCol: String = get(probabilityCol) } private[ml] trait HasThreshold extends Params { - /** param for threshold in (binary) prediction */ + /** + * param for threshold in (binary) prediction + * @group param + */ val threshold: DoubleParam = new DoubleParam(this, "threshold", "threshold in prediction") + + /** @group getParam */ def getThreshold: Double = get(threshold) } private[ml] trait HasInputCol extends Params { - /** param for input column name */ + /** + * param for input column name + * @group param + */ val inputCol: Param[String] = new Param(this, "inputCol", "input column name") + + /** @group getParam */ def getInputCol: String = get(inputCol) } private[ml] trait HasOutputCol extends Params { - /** param for output column name */ + /** + * param for output column name + * @group param + */ val outputCol: Param[String] = new Param(this, "outputCol", "output column name") + + /** @group getParam */ def getOutputCol: String = get(outputCol) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index bf5737177ceee..aac487745f77d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -49,43 +49,89 @@ import org.apache.spark.util.random.XORShiftRandom private[recommendation] trait ALSParams extends Params with HasMaxIter with HasRegParam with HasPredictionCol { - /** Param for rank of the matrix factorization. */ + /** + * Param for rank of the matrix factorization. + * @group param + */ val rank = new IntParam(this, "rank", "rank of the factorization", Some(10)) + + /** @group getParam */ def getRank: Int = get(rank) - /** Param for number of user blocks. */ + /** + * Param for number of user blocks. + * @group param + */ val numUserBlocks = new IntParam(this, "numUserBlocks", "number of user blocks", Some(10)) + + /** @group getParam */ def getNumUserBlocks: Int = get(numUserBlocks) - /** Param for number of item blocks. */ + /** + * Param for number of item blocks. + * @group param + */ val numItemBlocks = new IntParam(this, "numItemBlocks", "number of item blocks", Some(10)) + + /** @group getParam */ def getNumItemBlocks: Int = get(numItemBlocks) - /** Param to decide whether to use implicit preference. */ + /** + * Param to decide whether to use implicit preference. + * @group param + */ val implicitPrefs = new BooleanParam(this, "implicitPrefs", "whether to use implicit preference", Some(false)) + + /** @group getParam */ def getImplicitPrefs: Boolean = get(implicitPrefs) - /** Param for the alpha parameter in the implicit preference formulation. */ + /** + * Param for the alpha parameter in the implicit preference formulation. + * @group param + */ val alpha = new DoubleParam(this, "alpha", "alpha for implicit preference", Some(1.0)) + + /** @group getParam */ def getAlpha: Double = get(alpha) - /** Param for the column name for user ids. 
*/ + /** + * Param for the column name for user ids. + * @group param + */ val userCol = new Param[String](this, "userCol", "column name for user ids", Some("user")) + + /** @group getParam */ def getUserCol: String = get(userCol) - /** Param for the column name for item ids. */ + /** + * Param for the column name for item ids. + * @group param + */ val itemCol = new Param[String](this, "itemCol", "column name for item ids", Some("item")) + + /** @group getParam */ def getItemCol: String = get(itemCol) - /** Param for the column name for ratings. */ + /** + * Param for the column name for ratings. + * @group param + */ val ratingCol = new Param[String](this, "ratingCol", "column name for ratings", Some("rating")) + + /** @group getParam */ def getRatingCol: String = get(ratingCol) + /** + * Param for whether to apply nonnegativity constraints. + * @group param + */ val nonnegative = new BooleanParam( this, "nonnegative", "whether to use nonnegative constraint for least squares", Some(false)) + + /** @group getParam */ val getNonnegative: Boolean = get(nonnegative) /** @@ -181,20 +227,46 @@ class ALS extends Estimator[ALSModel] with ALSParams { import org.apache.spark.ml.recommendation.ALS.Rating + /** @group setParam */ def setRank(value: Int): this.type = set(rank, value) + + /** @group setParam */ def setNumUserBlocks(value: Int): this.type = set(numUserBlocks, value) + + /** @group setParam */ def setNumItemBlocks(value: Int): this.type = set(numItemBlocks, value) + + /** @group setParam */ def setImplicitPrefs(value: Boolean): this.type = set(implicitPrefs, value) + + /** @group setParam */ def setAlpha(value: Double): this.type = set(alpha, value) + + /** @group setParam */ def setUserCol(value: String): this.type = set(userCol, value) + + /** @group setParam */ def setItemCol(value: String): this.type = set(itemCol, value) + + /** @group setParam */ def setRatingCol(value: String): this.type = set(ratingCol, value) + + /** @group setParam */ def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ def setMaxIter(value: Int): this.type = set(maxIter, value) + + /** @group setParam */ def setRegParam(value: Double): this.type = set(regParam, value) + + /** @group setParam */ def setNonnegative(value: Boolean): this.type = set(nonnegative, value) - /** Sets both numUserBlocks and numItemBlocks to the specific value. */ + /** + * Sets both numUserBlocks and numItemBlocks to the specific value. 
+ * @group setParam + */ def setNumBlocks(value: Int): this.type = { setNumUserBlocks(value) setNumItemBlocks(value) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index d5a7bdafcb623..65f6627a0c351 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -44,7 +44,10 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress setRegParam(0.1) setMaxIter(100) + /** @group setParam */ def setRegParam(value: Double): this.type = set(regParam, value) + + /** @group setParam */ def setMaxIter(value: Int): this.type = set(maxIter, value) override protected def train(dataset: DataFrame, paramMap: ParamMap): LinearRegressionModel = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 324b1ba784387..b139bc8dcb44c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -31,22 +31,42 @@ import org.apache.spark.sql.types.StructType * Params for [[CrossValidator]] and [[CrossValidatorModel]]. */ private[ml] trait CrossValidatorParams extends Params { - /** param for the estimator to be cross-validated */ + /** + * param for the estimator to be cross-validated + * @group param + */ val estimator: Param[Estimator[_]] = new Param(this, "estimator", "estimator for selection") + + /** @group getParam */ def getEstimator: Estimator[_] = get(estimator) - /** param for estimator param maps */ + /** + * param for estimator param maps + * @group param + */ val estimatorParamMaps: Param[Array[ParamMap]] = new Param(this, "estimatorParamMaps", "param maps for the estimator") + + /** @group getParam */ def getEstimatorParamMaps: Array[ParamMap] = get(estimatorParamMaps) - /** param for the evaluator for selection */ + /** + * param for the evaluator for selection + * @group param + */ val evaluator: Param[Evaluator] = new Param(this, "evaluator", "evaluator for selection") + + /** @group getParam */ def getEvaluator: Evaluator = get(evaluator) - /** param for number of folds for cross validation */ + /** + * param for number of folds for cross validation + * @group param + */ val numFolds: IntParam = new IntParam(this, "numFolds", "number of folds for cross validation", Some(3)) + + /** @group getParam */ def getNumFolds: Int = get(numFolds) } @@ -59,9 +79,16 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP private val f2jBLAS = new F2jBLAS + /** @group setParam */ def setEstimator(value: Estimator[_]): this.type = set(estimator, value) + + /** @group setParam */ def setEstimatorParamMaps(value: Array[ParamMap]): this.type = set(estimatorParamMaps, value) + + /** @group setParam */ def setEvaluator(value: Evaluator): this.type = set(evaluator, value) + + /** @group setParam */ def setNumFolds(value: Int): this.type = set(numFolds, value) override def fit(dataset: DataFrame, paramMap: ParamMap): CrossValidatorModel = { From d06d5ee9b33505774ef1e5becc01b47492f1a2dc Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 13 Feb 2015 17:45:31 -0800 Subject: [PATCH 166/357] [SPARK-5227] [SPARK-5679] Disable FileSystem cache in WholeTextFileRecordReaderSuite This patch fixes two difficult-to-reproduce Jenkins test failures in 
InputOutputMetricsSuite (SPARK-5227 and SPARK-5679). The problem was that WholeTextFileRecordReaderSuite modifies the `fs.local.block.size` Hadoop configuration and this change was affecting subsequent test suites due to Hadoop's caching of FileSystem instances (see HADOOP-8490 for more details). The fix implemented here is to disable FileSystem caching in WholeTextFileRecordReaderSuite. Author: Josh Rosen Closes #4599 from JoshRosen/inputoutputsuite-fix and squashes the following commits: 47dc447 [Josh Rosen] [SPARK-5227] [SPARK-5679] Disable FileSystem cache in WholeTextFileRecordReaderSuite --- .../spark/input/WholeTextFileRecordReaderSuite.scala | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala index 98b0a16ce88ba..2e58c159a2ed8 100644 --- a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala @@ -28,7 +28,7 @@ import org.scalatest.FunSuite import org.apache.hadoop.io.Text -import org.apache.spark.SparkContext +import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.util.Utils import org.apache.hadoop.io.compress.{DefaultCodec, CompressionCodecFactory, GzipCodec} @@ -42,7 +42,15 @@ class WholeTextFileRecordReaderSuite extends FunSuite with BeforeAndAfterAll { private var factory: CompressionCodecFactory = _ override def beforeAll() { - sc = new SparkContext("local", "test") + // Hadoop's FileSystem caching does not use the Configuration as part of its cache key, which + // can cause Filesystem.get(Configuration) to return a cached instance created with a different + // configuration than the one passed to get() (see HADOOP-8490 for more details). This caused + // hard-to-reproduce test failures, since any suites that were run after this one would inherit + // the new value of "fs.local.block.size" (see SPARK-5227 and SPARK-5679). To work around this, + // we disable FileSystem caching in this suite. + val conf = new SparkConf().set("spark.hadoop.fs.file.impl.disable.cache", "true") + + sc = new SparkContext("local", "test", conf) // Set the block size of local file system to test whether files are split right or not. sc.hadoopConfiguration.setLong("fs.local.block.size", 32) From 0ce4e430a81532dc317136f968f28742e087d840 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Fri, 13 Feb 2015 20:12:52 -0800 Subject: [PATCH 167/357] SPARK-3290 [GRAPHX] No unpersist calls in SVDPlusPlus This just unpersist()s each RDD in this code that was cache()ed.
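At each update step the change below applies the same pattern: cache the newly joined graph, force it to materialize, then unpersist the graph it replaces. A minimal sketch of that pattern (hypothetical helper and parameter names, not code from this patch):

    import scala.reflect.ClassTag
    import org.apache.spark.graphx.{Graph, VertexId, VertexRDD}

    // Cache the updated graph and materialize it before unpersisting the graph it
    // supersedes, so the update is not recomputed from an already-dropped lineage.
    def swapCached[VD: ClassTag, ED: ClassTag, A: ClassTag](
        old: Graph[VD, ED],
        msgs: VertexRDD[A])(
        merge: (VertexId, VD, Option[A]) => VD): Graph[VD, ED] = {
      val updated = old.outerJoinVertices(msgs)(merge).cache()
      updated.vertices.count()  // materialize before dropping the old graph
      updated.edges.count()
      old.unpersist()
      updated
    }

Materializing first matters because the new graph is computed lazily from the old one; unpersisting the old graph before the counts would force that computation to restart from the original, uncached data.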
Author: Sean Owen Closes #4234 from srowen/SPARK-3290 and squashes the following commits: 66c1e11 [Sean Owen] unpersist() each RDD that was cache()ed --- .../apache/spark/graphx/lib/SVDPlusPlus.scala | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala index f58587e10a820..112ed09ef46dc 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala @@ -72,17 +72,22 @@ object SVDPlusPlus { // construct graph var g = Graph.fromEdges(edges, defaultF(conf.rank)).cache() + materialize(g) + edges.unpersist() // Calculate initial bias and norm val t0 = g.aggregateMessages[(Long, Double)]( ctx => { ctx.sendToSrc((1L, ctx.attr)); ctx.sendToDst((1L, ctx.attr)) }, (g1, g2) => (g1._1 + g2._1, g1._2 + g2._2)) - g = g.outerJoinVertices(t0) { + val gJoinT0 = g.outerJoinVertices(t0) { (vid: VertexId, vd: (DoubleMatrix, DoubleMatrix, Double, Double), msg: Option[(Long, Double)]) => (vd._1, vd._2, msg.get._2 / msg.get._1, 1.0 / scala.math.sqrt(msg.get._1)) - } + }.cache() + materialize(gJoinT0) + g.unpersist() + g = gJoinT0 def sendMsgTrainF(conf: Conf, u: Double) (ctx: EdgeContext[ @@ -114,12 +119,15 @@ object SVDPlusPlus { val t1 = g.aggregateMessages[DoubleMatrix]( ctx => ctx.sendToSrc(ctx.dstAttr._2), (g1, g2) => g1.addColumnVector(g2)) - g = g.outerJoinVertices(t1) { + val gJoinT1 = g.outerJoinVertices(t1) { (vid: VertexId, vd: (DoubleMatrix, DoubleMatrix, Double, Double), msg: Option[DoubleMatrix]) => if (msg.isDefined) (vd._1, vd._1 .addColumnVector(msg.get.mul(vd._4)), vd._3, vd._4) else vd - } + }.cache() + materialize(gJoinT1) + g.unpersist() + g = gJoinT1 // Phase 2, update p for user nodes and q, y for item nodes g.cache() @@ -127,13 +135,16 @@ object SVDPlusPlus { sendMsgTrainF(conf, u), (g1: (DoubleMatrix, DoubleMatrix, Double), g2: (DoubleMatrix, DoubleMatrix, Double)) => (g1._1.addColumnVector(g2._1), g1._2.addColumnVector(g2._2), g1._3 + g2._3)) - g = g.outerJoinVertices(t2) { + val gJoinT2 = g.outerJoinVertices(t2) { (vid: VertexId, vd: (DoubleMatrix, DoubleMatrix, Double, Double), msg: Option[(DoubleMatrix, DoubleMatrix, Double)]) => (vd._1.addColumnVector(msg.get._1), vd._2.addColumnVector(msg.get._2), vd._3 + msg.get._3, vd._4) - } + }.cache() + materialize(gJoinT2) + g.unpersist() + g = gJoinT2 } // calculate error on training set @@ -147,13 +158,26 @@ object SVDPlusPlus { val err = (ctx.attr - pred) * (ctx.attr - pred) ctx.sendToDst(err) } + g.cache() val t3 = g.aggregateMessages[Double](sendMsgTestF(conf, u), _ + _) - g = g.outerJoinVertices(t3) { + val gJoinT3 = g.outerJoinVertices(t3) { (vid: VertexId, vd: (DoubleMatrix, DoubleMatrix, Double, Double), msg: Option[Double]) => if (msg.isDefined) (vd._1, vd._2, vd._3, msg.get) else vd - } + }.cache() + materialize(gJoinT3) + g.unpersist() + g = gJoinT3 (g, u) } + + /** + * Forces materialization of a Graph by count()ing its RDDs. + */ + private def materialize(g: Graph[_,_]): Unit = { + g.vertices.count() + g.edges.count() + } + } From e98dfe627c5d0201464cdd0f363f391ea84c389a Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 13 Feb 2015 23:03:22 -0800 Subject: [PATCH 168/357] [SPARK-5752][SQL] Don't implicitly convert RDDs directly to DataFrames - The old implicit would convert RDDs directly to DataFrames, and that added too many methods. 
- toDataFrame -> toDF - Dsl -> functions - implicits moved into SQLContext.implicits - addColumn -> withColumn - renameColumn -> withColumnRenamed Python changes: - toDataFrame -> toDF - Dsl -> functions package - addColumn -> withColumn - renameColumn -> withColumnRenamed - add toDF functions to RDD on SQLContext init - add flatMap to DataFrame Author: Reynold Xin Author: Davies Liu Closes #4556 from rxin/SPARK-5752 and squashes the following commits: 5ef9910 [Reynold Xin] More fix 61d3fca [Reynold Xin] Merge branch 'df5' of github.com:davies/spark into SPARK-5752 ff5832c [Reynold Xin] Fix python 749c675 [Reynold Xin] count(*) fixes. 5806df0 [Reynold Xin] Fix build break again. d941f3d [Reynold Xin] Fixed explode compilation break. fe1267a [Davies Liu] flatMap c4afb8e [Reynold Xin] style d9de47f [Davies Liu] add comment b783994 [Davies Liu] add comment for toDF e2154e5 [Davies Liu] schema() -> schema 3a1004f [Davies Liu] Dsl -> functions, toDF() fb256af [Reynold Xin] - toDataFrame -> toDF - Dsl -> functions - implicits moved into SQLContext.implicits - addColumn -> withColumn - renameColumn -> withColumnRenamed 0dd74eb [Reynold Xin] [SPARK-5752][SQL] Don't implicitly convert RDDs directly to DataFrames 97dd47c [Davies Liu] fix mistake 6168f74 [Davies Liu] fix test 1fc0199 [Davies Liu] fix test a075cd5 [Davies Liu] clean up, toPandas 663d314 [Davies Liu] add test for agg('*') 9e214d5 [Reynold Xin] count(*) fixes. 1ed7136 [Reynold Xin] Fix build break again. 921b2e3 [Reynold Xin] Fixed explode compilation break. 14698d4 [Davies Liu] flatMap ba3e12d [Reynold Xin] style d08c92d [Davies Liu] add comment 5c8b524 [Davies Liu] add comment for toDF a4e5e66 [Davies Liu] schema() -> schema d377fc9 [Davies Liu] Dsl -> functions, toDF() 6b3086c [Reynold Xin] - toDataFrame -> toDF - Dsl -> functions - implicits moved into SQLContext.implicits - addColumn -> withColumn - renameColumn -> withColumnRenamed 807e8b1 [Reynold Xin] [SPARK-5752][SQL] Don't implicitly convert RDDs directly to DataFrames --- .../examples/ml/CrossValidatorExample.scala | 4 +- .../examples/ml/DeveloperApiExample.scala | 4 +- .../spark/examples/ml/MovieLensALS.scala | 6 +- .../examples/ml/SimpleParamsExample.scala | 6 +- .../ml/SimpleTextClassificationPipeline.scala | 4 +- .../spark/examples/mllib/DatasetExample.scala | 8 +- .../spark/examples/sql/RDDRelation.scala | 10 +- .../examples/sql/hive/HiveFromSpark.scala | 2 +- .../org/apache/spark/ml/Transformer.scala | 6 +- .../spark/ml/classification/Classifier.scala | 16 +- .../classification/LogisticRegression.scala | 33 ++- .../ProbabilisticClassifier.scala | 6 +- .../spark/ml/feature/StandardScaler.scala | 4 +- .../spark/ml/impl/estimator/Predictor.scala | 4 +- .../apache/spark/ml/recommendation/ALS.scala | 6 +- .../mllib/classification/NaiveBayes.scala | 2 +- .../impl/GLMClassificationModel.scala | 2 +- .../MatrixFactorizationModel.scala | 4 +- .../regression/impl/GLMRegressionModel.scala | 2 +- .../mllib/tree/model/DecisionTreeModel.scala | 2 +- .../mllib/tree/model/treeEnsembleModels.scala | 2 +- .../spark/ml/recommendation/ALSSuite.scala | 4 +- python/docs/pyspark.sql.rst | 8 + python/pyspark/mllib/tests.py | 2 +- python/pyspark/sql/__init__.py | 3 +- python/pyspark/sql/context.py | 34 ++- python/pyspark/sql/dataframe.py | 221 ++++-------------- python/pyspark/sql/functions.py | 170 ++++++++++++++ python/pyspark/sql/tests.py | 38 ++- python/run-tests | 3 +- .../apache/spark/repl/SparkILoopInit.scala | 2 +- .../org/apache/spark/repl/ReplSuite.scala | 2 +- 
.../org/apache/spark/repl/SparkILoop.scala | 2 +- .../sql/catalyst/analysis/unresolved.scala | 2 +- .../scala/org/apache/spark/sql/Column.scala | 21 +- .../org/apache/spark/sql/DataFrame.scala | 25 +- .../apache/spark/sql/DataFrameHolder.scala | 30 +++ .../org/apache/spark/sql/DataFrameImpl.scala | 6 +- .../org/apache/spark/sql/GroupedData.scala | 19 +- .../apache/spark/sql/IncomputableColumn.scala | 6 +- .../org/apache/spark/sql/SQLContext.scala | 35 ++- .../spark/sql/UserDefinedFunction.scala | 4 +- .../spark/sql/{Dsl.scala => functions.scala} | 21 +- .../spark/sql/parquet/ParquetTest.scala | 2 +- .../apache/spark/sql/api/java/JavaDsl.java | 2 +- .../apache/spark/sql/CachedTableSuite.scala | 7 +- .../spark/sql/ColumnExpressionSuite.scala | 10 +- .../spark/sql/DataFrameImplicitsSuite.scala | 10 +- .../org/apache/spark/sql/DataFrameSuite.scala | 51 ++-- .../org/apache/spark/sql/JoinSuite.scala | 3 +- .../apache/spark/sql/ListTablesSuite.scala | 2 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 8 +- .../sql/ScalaReflectionRelationSuite.scala | 10 +- .../scala/org/apache/spark/sql/TestData.scala | 46 ++-- .../scala/org/apache/spark/sql/UDFSuite.scala | 2 +- .../spark/sql/UserDefinedTypeSuite.scala | 4 +- .../columnar/InMemoryColumnarQuerySuite.scala | 8 +- .../columnar/PartitionBatchPruningSuite.scala | 5 +- .../spark/sql/execution/PlannerSuite.scala | 3 +- .../org/apache/spark/sql/json/JsonSuite.scala | 7 +- .../spark/sql/parquet/ParquetIOSuite.scala | 6 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 8 +- .../spark/sql/hive/ListTablesSuite.scala | 2 +- .../sql/hive/MetastoreDataSourcesSuite.scala | 17 +- .../sql/hive/execution/HiveQuerySuite.scala | 12 +- .../hive/execution/HiveResolutionSuite.scala | 6 +- .../hive/execution/HiveTableScanSuite.scala | 3 +- .../sql/hive/execution/HiveUdfSuite.scala | 10 +- .../sql/hive/execution/SQLQuerySuite.scala | 11 +- .../spark/sql/parquet/parquetSuites.scala | 6 +- 70 files changed, 596 insertions(+), 456 deletions(-) create mode 100644 python/pyspark/sql/functions.py create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/DataFrameHolder.scala rename sql/core/src/main/scala/org/apache/spark/sql/{Dsl.scala => functions.scala} (97%) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala index a2893f78e0fec..f0241943ef410 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala @@ -90,7 +90,7 @@ object CrossValidatorExample { crossval.setNumFolds(2) // Use 3+ in practice // Run cross-validation, and choose the best set of parameters. - val cvModel = crossval.fit(training) + val cvModel = crossval.fit(training.toDF) // Prepare test documents, which are unlabeled. val test = sc.parallelize(Seq( @@ -100,7 +100,7 @@ object CrossValidatorExample { Document(7L, "apache hadoop"))) // Make predictions on test documents. cvModel uses the best model found (lrModel). 
- cvModel.transform(test) + cvModel.transform(test.toDF) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index aed44238939c7..54aadd2288817 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -58,7 +58,7 @@ object DeveloperApiExample { lr.setMaxIter(10) // Learn a LogisticRegression model. This uses the parameters stored in lr. - val model = lr.fit(training) + val model = lr.fit(training.toDF) // Prepare test data. val test = sc.parallelize(Seq( @@ -67,7 +67,7 @@ object DeveloperApiExample { LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))) // Make predictions on test data. - val sumPredictions: Double = model.transform(test) + val sumPredictions: Double = model.transform(test.toDF) .select("features", "label", "prediction") .collect() .map { case Row(features: Vector, label: Double, prediction: Double) => diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala index 836ea2e01201e..adaf796dc1896 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala @@ -137,9 +137,9 @@ object MovieLensALS { .setRegParam(params.regParam) .setNumBlocks(params.numBlocks) - val model = als.fit(training) + val model = als.fit(training.toDF) - val predictions = model.transform(test).cache() + val predictions = model.transform(test.toDF).cache() // Evaluate the model. // TODO: Create an evaluator to compute RMSE. @@ -158,7 +158,7 @@ object MovieLensALS { // Inspect false positives. predictions.registerTempTable("prediction") - sc.textFile(params.movies).map(Movie.parseMovie).registerTempTable("movie") + sc.textFile(params.movies).map(Movie.parseMovie).toDF.registerTempTable("movie") sqlContext.sql( """ |SELECT userId, prediction.movieId, title, rating, prediction diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala index 80c9f5ff5781e..c5bb5515b1930 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala @@ -58,7 +58,7 @@ object SimpleParamsExample { .setRegParam(0.01) // Learn a LogisticRegression model. This uses the parameters stored in lr. - val model1 = lr.fit(training) + val model1 = lr.fit(training.toDF) // Since model1 is a Model (i.e., a Transformer produced by an Estimator), // we can view the parameters it used during fit(). // This prints the parameter (name: value) pairs, where names are unique IDs for this @@ -77,7 +77,7 @@ object SimpleParamsExample { // Now learn a new model using the paramMapCombined parameters. // paramMapCombined overrides all parameters set earlier via lr.set* methods. - val model2 = lr.fit(training, paramMapCombined) + val model2 = lr.fit(training.toDF, paramMapCombined) println("Model 2 was fit using parameters: " + model2.fittingParamMap) // Prepare test data. 
@@ -90,7 +90,7 @@ object SimpleParamsExample { // LogisticRegression.transform will only use the 'features' column. // Note that model2.transform() outputs a 'myProbability' column instead of the usual // 'probability' column since we renamed the lr.probabilityCol parameter previously. - model2.transform(test) + model2.transform(test.toDF) .select("features", "label", "myProbability", "prediction") .collect() .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala index 968cb292120d8..8b47f88e48df1 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala @@ -69,7 +69,7 @@ object SimpleTextClassificationPipeline { .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. - val model = pipeline.fit(training) + val model = pipeline.fit(training.toDF) // Prepare test documents, which are unlabeled. val test = sc.parallelize(Seq( @@ -79,7 +79,7 @@ object SimpleTextClassificationPipeline { Document(7L, "apache hadoop"))) // Make predictions on test documents. - model.transform(test) + model.transform(test.toDF) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala index 89b6255991a38..c98c68a02f2be 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala @@ -81,18 +81,18 @@ object DatasetExample { println(s"Loaded ${origData.count()} instances from file: ${params.input}") // Convert input data to DataFrame explicitly. - val df: DataFrame = origData.toDataFrame + val df: DataFrame = origData.toDF println(s"Inferred schema:\n${df.schema.prettyJson}") println(s"Converted to DataFrame with ${df.count()} records") - // Select columns, using implicit conversion to DataFrames. 
- val labelsDf: DataFrame = origData.select("label") + // Select columns + val labelsDf: DataFrame = df.select("label") val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v } val numLabels = labels.count() val meanLabel = labels.fold(0.0)(_ + _) / numLabels println(s"Selected label column with average value $meanLabel") - val featuresDf: DataFrame = origData.select("features") + val featuresDf: DataFrame = df.select("features") val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(feat), diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala index 1eac3c8d03e39..79d3d5a24ceaf 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala @@ -19,7 +19,7 @@ package org.apache.spark.examples.sql import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ // One method for defining the schema of an RDD is to make a case class with the desired column // names and types. @@ -34,10 +34,10 @@ object RDDRelation { // Importing the SQL context gives access to all the SQL functions and implicit conversions. import sqlContext.implicits._ - val rdd = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))) + val df = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))).toDF // Any RDD containing case classes can be registered as a table. The schema of the table is // automatically inferred using scala reflection. - rdd.registerTempTable("records") + df.registerTempTable("records") // Once tables have been registered, you can run SQL queries over them. println("Result of SELECT *:") @@ -55,10 +55,10 @@ object RDDRelation { rddFromSql.map(row => s"Key: ${row(0)}, Value: ${row(1)}").collect().foreach(println) // Queries can also be written using a LINQ-like Scala DSL. - rdd.where($"key" === 1).orderBy($"value".asc).select($"key").collect().foreach(println) + df.where($"key" === 1).orderBy($"value".asc).select($"key").collect().foreach(println) // Write out an RDD as a parquet file. - rdd.saveAsParquetFile("pair.parquet") + df.saveAsParquetFile("pair.parquet") // Read in parquet file. Parquet files are self-describing so the schmema is preserved. val parquetFile = sqlContext.parquetFile("pair.parquet") diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala index 15754cdfcc35e..7128deba54da7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala @@ -68,7 +68,7 @@ object HiveFromSpark { // You can also register RDDs as temporary tables within a HiveContext. val rdd = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))) - rdd.registerTempTable("records") + rdd.toDF.registerTempTable("records") // Queries can then join RDD data with data stored in Hive. 
println("Result of SELECT *:") diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala index 2ec2ccdb8c8b3..9a5848684b179 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala @@ -23,7 +23,7 @@ import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param._ import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** @@ -100,7 +100,7 @@ private[ml] abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, O override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { transformSchema(dataset.schema, paramMap, logging = true) val map = this.paramMap ++ paramMap - dataset.select($"*", callUDF( - this.createTransformFunc(map), outputDataType, dataset(map(inputCol))).as(map(outputCol))) + dataset.withColumn(map(outputCol), + callUDF(this.createTransformFunc(map), outputDataType, dataset(map(inputCol)))) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala index 124ab30f279c1..c5fc89f935432 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.{DeveloperApi, AlphaComponent} import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams} import org.apache.spark.ml.param.{Params, ParamMap, HasRawPredictionCol} import org.apache.spark.mllib.linalg.{Vector, VectorUDT} -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{DataType, DoubleType, StructType} @@ -182,24 +182,22 @@ private[ml] object ClassificationModel { if (map(model.rawPredictionCol) != "") { // output raw prediction val features2raw: FeaturesType => Vector = model.predictRaw - tmpData = tmpData.select($"*", - callUDF(features2raw, new VectorUDT, - col(map(model.featuresCol))).as(map(model.rawPredictionCol))) + tmpData = tmpData.withColumn(map(model.rawPredictionCol), + callUDF(features2raw, new VectorUDT, col(map(model.featuresCol)))) numColsOutput += 1 if (map(model.predictionCol) != "") { val raw2pred: Vector => Double = (rawPred) => { rawPred.toArray.zipWithIndex.maxBy(_._1)._2 } - tmpData = tmpData.select($"*", callUDF(raw2pred, DoubleType, - col(map(model.rawPredictionCol))).as(map(model.predictionCol))) + tmpData = tmpData.withColumn(map(model.predictionCol), + callUDF(raw2pred, DoubleType, col(map(model.rawPredictionCol)))) numColsOutput += 1 } } else if (map(model.predictionCol) != "") { // output prediction val features2pred: FeaturesType => Double = model.predict - tmpData = tmpData.select($"*", - callUDF(features2pred, DoubleType, - col(map(model.featuresCol))).as(map(model.predictionCol))) + tmpData = tmpData.withColumn(map(model.predictionCol), + callUDF(features2pred, DoubleType, col(map(model.featuresCol)))) numColsOutput += 1 } (numColsOutput, tmpData) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index a9a5af5f0f2d7..21f61d80dd95a 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -22,7 +22,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.linalg.{VectorUDT, BLAS, Vector, Vectors} import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType import org.apache.spark.storage.StorageLevel @@ -130,44 +130,39 @@ class LogisticRegressionModel private[ml] ( var numColsOutput = 0 if (map(rawPredictionCol) != "") { val features2raw: Vector => Vector = (features) => predictRaw(features) - tmpData = tmpData.select($"*", - callUDF(features2raw, new VectorUDT, col(map(featuresCol))).as(map(rawPredictionCol))) + tmpData = tmpData.withColumn(map(rawPredictionCol), + callUDF(features2raw, new VectorUDT, col(map(featuresCol)))) numColsOutput += 1 } if (map(probabilityCol) != "") { if (map(rawPredictionCol) != "") { - val raw2prob: Vector => Vector = { (rawPreds: Vector) => + val raw2prob = udf { (rawPreds: Vector) => val prob1 = 1.0 / (1.0 + math.exp(-rawPreds(1))) - Vectors.dense(1.0 - prob1, prob1) + Vectors.dense(1.0 - prob1, prob1): Vector } - tmpData = tmpData.select($"*", - callUDF(raw2prob, new VectorUDT, col(map(rawPredictionCol))).as(map(probabilityCol))) + tmpData = tmpData.withColumn(map(probabilityCol), raw2prob(col(map(rawPredictionCol)))) } else { - val features2prob: Vector => Vector = (features: Vector) => predictProbabilities(features) - tmpData = tmpData.select($"*", - callUDF(features2prob, new VectorUDT, col(map(featuresCol))).as(map(probabilityCol))) + val features2prob = udf { (features: Vector) => predictProbabilities(features) : Vector } + tmpData = tmpData.withColumn(map(probabilityCol), features2prob(col(map(featuresCol)))) } numColsOutput += 1 } if (map(predictionCol) != "") { val t = map(threshold) if (map(probabilityCol) != "") { - val predict: Vector => Double = { probs: Vector => + val predict = udf { probs: Vector => if (probs(1) > t) 1.0 else 0.0 } - tmpData = tmpData.select($"*", - callUDF(predict, DoubleType, col(map(probabilityCol))).as(map(predictionCol))) + tmpData = tmpData.withColumn(map(predictionCol), predict(col(map(probabilityCol)))) } else if (map(rawPredictionCol) != "") { - val predict: Vector => Double = { rawPreds: Vector => + val predict = udf { rawPreds: Vector => val prob1 = 1.0 / (1.0 + math.exp(-rawPreds(1))) if (prob1 > t) 1.0 else 0.0 } - tmpData = tmpData.select($"*", - callUDF(predict, DoubleType, col(map(rawPredictionCol))).as(map(predictionCol))) + tmpData = tmpData.withColumn(map(predictionCol), predict(col(map(rawPredictionCol)))) } else { - val predict: Vector => Double = (features: Vector) => this.predict(features) - tmpData = tmpData.select($"*", - callUDF(predict, DoubleType, col(map(featuresCol))).as(map(predictionCol))) + val predict = udf { features: Vector => this.predict(features) } + tmpData = tmpData.withColumn(map(predictionCol), predict(col(map(featuresCol)))) } numColsOutput += 1 } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala index 38518785dceae..bd8caac855981 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.{AlphaComponent, DeveloperApi} import org.apache.spark.ml.param.{HasProbabilityCol, ParamMap, Params} import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DataType, StructType} @@ -122,8 +122,8 @@ private[spark] abstract class ProbabilisticClassificationModel[ val features2probs: FeaturesType => Vector = (features) => { tmpModel.predictProbabilities(features) } - outputData.select($"*", - callUDF(features2probs, new VectorUDT, col(map(featuresCol))).as(map(probabilityCol))) + outputData.withColumn(map(probabilityCol), + callUDF(features2probs, new VectorUDT, col(map(featuresCol)))) } else { if (numColsOutput == 0) { this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() was called as NOOP" + diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 7623ec59ae0eb..ddbd648d64f23 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -23,7 +23,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{StructField, StructType} /** @@ -88,7 +88,7 @@ class StandardScalerModel private[ml] ( transformSchema(dataset.schema, paramMap, logging = true) val map = this.paramMap ++ paramMap val scale = udf((v: Vector) => { scaler.transform(v) } : Vector) - dataset.select($"*", scale(col(map(inputCol))).as(map(outputCol))) + dataset.withColumn(map(outputCol), scale(col(map(inputCol)))) } private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala index e416c1eb585fc..7daeff980f0ea 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala @@ -24,7 +24,7 @@ import org.apache.spark.mllib.linalg.{VectorUDT, Vector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DataType, DoubleType, StructType} @@ -216,7 +216,7 @@ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel val pred: FeaturesType => Double = (features) => { tmpModel.predict(features) } - dataset.select($"*", callUDF(pred, DoubleType, col(map(featuresCol))).as(map(predictionCol))) + dataset.withColumn(map(predictionCol), callUDF(pred, DoubleType, col(map(featuresCol)))) } else { this.logWarning(s"$uid: Predictor.transform() was called as NOOP" + " since no output columns were set.") diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index aac487745f77d..8d70e4347c4c9 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -36,7 +36,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.mllib.optimization.NNLS import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType, IntegerType, StructField, StructType} import org.apache.spark.storage.StorageLevel import org.apache.spark.util.Utils @@ -170,8 +170,8 @@ class ALSModel private[ml] ( override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = { import dataset.sqlContext.implicits._ val map = this.paramMap ++ paramMap - val users = userFactors.toDataFrame("id", "features") - val items = itemFactors.toDataFrame("id", "features") + val users = userFactors.toDF("id", "features") + val items = itemFactors.toDF("id", "features") // Register a UDF for DataFrame, and then // create a new column named map(predictionCol) by running the predict UDF. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index f9142bc2268b7..dd7a9469d5041 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -102,7 +102,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path)) // Create Parquet data. - val dataRDD: DataFrame = sc.parallelize(Seq(data), 1) + val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF dataRDD.saveAsParquetFile(dataPath(path)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala index 1d118963b49fc..0a358f2e4f39c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala @@ -62,7 +62,7 @@ private[classification] object GLMClassificationModel { // Create Parquet data. 
val data = Data(weights, intercept, threshold) - sc.parallelize(Seq(data), 1).saveAsParquetFile(Loader.dataPath(path)) + sc.parallelize(Seq(data), 1).toDF.saveAsParquetFile(Loader.dataPath(path)) } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala index a3a3b5d418c0a..c399496568bfb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala @@ -187,8 +187,8 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] { val metadata = compact(render( ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ ("rank" -> model.rank))) sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path)) - model.userFeatures.toDataFrame("id", "features").saveAsParquetFile(userPath(path)) - model.productFeatures.toDataFrame("id", "features").saveAsParquetFile(productPath(path)) + model.userFeatures.toDF("id", "features").saveAsParquetFile(userPath(path)) + model.productFeatures.toDF("id", "features").saveAsParquetFile(productPath(path)) } def load(sc: SparkContext, path: String): MatrixFactorizationModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala index f75de6f637640..7b27aaa322b00 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala @@ -58,7 +58,7 @@ private[regression] object GLMRegressionModel { // Create Parquet data. val data = Data(weights, intercept) - val dataRDD: DataFrame = sc.parallelize(Seq(data), 1) + val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF // TODO: repartition with 1 partition after SPARK-5532 gets fixed dataRDD.saveAsParquetFile(Loader.dataPath(path)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala index 373192a20c0ea..5dac62b0c42f0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala @@ -197,7 +197,7 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] { val nodes = model.topNode.subtreeIterator.toSeq val dataRDD: DataFrame = sc.parallelize(nodes) .map(NodeData.apply(0, _)) - .toDataFrame + .toDF dataRDD.saveAsParquetFile(Loader.dataPath(path)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index dbd69dca60b92..e507f247cca76 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -289,7 +289,7 @@ private[tree] object TreeEnsembleModel { // Create Parquet data. 
val dataRDD = sc.parallelize(model.trees.zipWithIndex).flatMap { case (tree, treeId) => tree.topNode.subtreeIterator.toSeq.map(node => NodeData(treeId, node)) - }.toDataFrame + }.toDF dataRDD.saveAsParquetFile(Loader.dataPath(path)) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index cb7d57de35c34..b118a8dcf1363 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -358,8 +358,8 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { .setNumUserBlocks(numUserBlocks) .setNumItemBlocks(numItemBlocks) val alpha = als.getAlpha - val model = als.fit(training) - val predictions = model.transform(test) + val model = als.fit(training.toDF) + val predictions = model.transform(test.toDF) .select("rating", "prediction") .map { case Row(rating: Float, prediction: Float) => (rating.toDouble, prediction.toDouble) diff --git a/python/docs/pyspark.sql.rst b/python/docs/pyspark.sql.rst index 80c6f02a9df41..e03379e521a07 100644 --- a/python/docs/pyspark.sql.rst +++ b/python/docs/pyspark.sql.rst @@ -16,3 +16,11 @@ pyspark.sql.types module :members: :undoc-members: :show-inheritance: + + +pyspark.sql.functions module +------------------------ +.. automodule:: pyspark.sql.functions + :members: + :undoc-members: + :show-inheritance: diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 49e5c9d58e5db..06207a076eece 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -335,7 +335,7 @@ def test_infer_schema(self): sqlCtx = SQLContext(self.sc) rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)]) srdd = sqlCtx.inferSchema(rdd) - schema = srdd.schema() + schema = srdd.schema field = [f for f in schema.fields if f.name == "features"][0] self.assertEqual(field.dataType, self.udt) vectors = srdd.map(lambda p: p.features).collect() diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py index 0a5ba00393aab..b9ffd6945ea7e 100644 --- a/python/pyspark/sql/__init__.py +++ b/python/pyspark/sql/__init__.py @@ -34,9 +34,8 @@ from pyspark.sql.context import SQLContext, HiveContext from pyspark.sql.types import Row -from pyspark.sql.dataframe import DataFrame, GroupedData, Column, Dsl, SchemaRDD +from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD __all__ = [ 'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row', - 'Dsl', ] diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 082f1b691b196..7683c1b4dfa4e 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -38,6 +38,25 @@ __all__ = ["SQLContext", "HiveContext"] +def _monkey_patch_RDD(sqlCtx): + def toDF(self, schema=None, sampleRatio=None): + """ + Convert current :class:`RDD` into a :class:`DataFrame` + + This is a shorthand for `sqlCtx.createDataFrame(rdd, schema, sampleRatio)` + + :param schema: a StructType or list of names of columns + :param samplingRatio: the sample ratio of rows used for inferring + :return: a DataFrame + + >>> rdd.toDF().collect() + [Row(name=u'Alice', age=1)] + """ + return sqlCtx.createDataFrame(self, schema, sampleRatio) + + RDD.toDF = toDF + + class SQLContext(object): """Main entry point for Spark SQL functionality. 
@@ -49,15 +68,20 @@ class SQLContext(object): def __init__(self, sparkContext, sqlContext=None): """Create a new SQLContext. + It will add a method called `toDF` to :class:`RDD`, which could be + used to convert an RDD into a DataFrame, it's a shorthand for + :func:`SQLContext.createDataFrame`. + :param sparkContext: The SparkContext to wrap. :param sqlContext: An optional JVM Scala SQLContext. If set, we do not instatiate a new SQLContext in the JVM, instead we make all calls to this object. >>> from datetime import datetime + >>> sqlCtx = SQLContext(sc) >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1L, ... b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1), ... time=datetime(2014, 8, 1, 14, 1, 5))]) - >>> df = sqlCtx.createDataFrame(allTypes) + >>> df = allTypes.toDF() >>> df.registerTempTable("allTypes") >>> sqlCtx.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a ' ... 'from allTypes where b and i > 0').collect() @@ -70,6 +94,7 @@ def __init__(self, sparkContext, sqlContext=None): self._jsc = self._sc._jsc self._jvm = self._sc._jvm self._scala_SQLContext = sqlContext + _monkey_patch_RDD(self) @property def _ssql_ctx(self): @@ -442,7 +467,7 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0): Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) - >>> df3 = sqlCtx.jsonFile(jsonFile, df1.schema()) + >>> df3 = sqlCtx.jsonFile(jsonFile, df1.schema) >>> sqlCtx.registerRDDAsTable(df3, "table2") >>> df4 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " @@ -495,7 +520,7 @@ def jsonRDD(self, rdd, schema=None, samplingRatio=1.0): Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) - >>> df3 = sqlCtx.jsonRDD(json, df1.schema()) + >>> df3 = sqlCtx.jsonRDD(json, df1.schema) >>> sqlCtx.registerRDDAsTable(df3, "table2") >>> df4 = sqlCtx.sql( ... 
"SELECT field1 AS f1, field2 as f2, field3 as f3, " @@ -800,7 +825,8 @@ def _test(): Row(field1=2, field2="row2"), Row(field1=3, field2="row3")] ) - globs['df'] = sqlCtx.createDataFrame(rdd) + _monkey_patch_RDD(sqlCtx) + globs['df'] = rdd.toDF() jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index b6f052ee44ae2..1438fe5285cc5 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -21,21 +21,19 @@ import random import os from tempfile import NamedTemporaryFile -from itertools import imap from py4j.java_collections import ListConverter, MapConverter from pyspark.context import SparkContext -from pyspark.rdd import RDD, _prepare_for_python_RDD -from pyspark.serializers import BatchedSerializer, AutoBatchedSerializer, PickleSerializer, \ - UTF8Deserializer +from pyspark.rdd import RDD +from pyspark.serializers import BatchedSerializer, PickleSerializer, UTF8Deserializer from pyspark.storagelevel import StorageLevel from pyspark.traceback_utils import SCCallSiteSync from pyspark.sql.types import * from pyspark.sql.types import _create_cls, _parse_datatype_json_string -__all__ = ["DataFrame", "GroupedData", "Column", "Dsl", "SchemaRDD"] +__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD"] class DataFrame(object): @@ -76,6 +74,7 @@ def __init__(self, jdf, sql_ctx): self.sql_ctx = sql_ctx self._sc = sql_ctx and sql_ctx._sc self.is_cached = False + self._schema = None # initialized lazily @property def rdd(self): @@ -86,7 +85,7 @@ def rdd(self): if not hasattr(self, '_lazy_rdd'): jrdd = self._jdf.javaToPython() rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer())) - schema = self.schema() + schema = self.schema def applySchema(it): cls = _create_cls(schema) @@ -216,14 +215,17 @@ def save(self, path=None, source=None, mode="append", **options): self._sc._gateway._gateway_client) self._jdf.save(source, jmode, joptions) + @property def schema(self): """Returns the schema of this DataFrame (represented by a L{StructType}). - >>> df.schema() + >>> df.schema StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true))) """ - return _parse_datatype_json_string(self._jdf.schema().json()) + if self._schema is None: + self._schema = _parse_datatype_json_string(self._jdf.schema().json()) + return self._schema def printSchema(self): """Prints out the schema in the tree format. @@ -284,7 +286,7 @@ def collect(self): with open(tempFile.name, 'rb') as tempFile: rs = list(BatchedSerializer(PickleSerializer()).load_stream(tempFile)) os.unlink(tempFile.name) - cls = _create_cls(self.schema()) + cls = _create_cls(self.schema) return [cls(r) for r in rs] def limit(self, num): @@ -310,14 +312,26 @@ def take(self, num): return self.limit(num).collect() def map(self, f): - """ Return a new RDD by applying a function to each Row, it's a - shorthand for df.rdd.map() + """ Return a new RDD by applying a function to each Row + + It's a shorthand for df.rdd.map() >>> df.map(lambda p: p.name).collect() [u'Alice', u'Bob'] """ return self.rdd.map(f) + def flatMap(self, f): + """ Return a new RDD by first applying a function to all elements of this, + and then flattening the results. 
+ + It's a shorthand for df.rdd.flatMap() + + >>> df.flatMap(lambda p: p.name).collect() + [u'A', u'l', u'i', u'c', u'e', u'B', u'o', u'b'] + """ + return self.rdd.flatMap(f) + def mapPartitions(self, f, preservesPartitioning=False): """ Return a new RDD by applying a function to each partition. @@ -378,21 +392,6 @@ def sample(self, withReplacement, fraction, seed=None): rdd = self._jdf.sample(withReplacement, fraction, long(seed)) return DataFrame(rdd, self.sql_ctx) - # def takeSample(self, withReplacement, num, seed=None): - # """Return a fixed-size sampled subset of this DataFrame. - # - # >>> df = sqlCtx.inferSchema(rdd) - # >>> df.takeSample(False, 2, 97) - # [Row(field1=3, field2=u'row3'), Row(field1=1, field2=u'row1')] - # """ - # seed = seed if seed is not None else random.randint(0, sys.maxint) - # with SCCallSiteSync(self.context) as css: - # bytesInJava = self._jdf \ - # .takeSampleToPython(withReplacement, num, long(seed)) \ - # .iterator() - # cls = _create_cls(self.schema()) - # return map(cls, self._collect_iterator_through_file(bytesInJava)) - @property def dtypes(self): """Return all column names and their data types as a list. @@ -400,7 +399,7 @@ def dtypes(self): >>> df.dtypes [('age', 'int'), ('name', 'string')] """ - return [(str(f.name), f.dataType.simpleString()) for f in self.schema().fields] + return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields] @property def columns(self): @@ -409,7 +408,7 @@ def columns(self): >>> df.columns [u'age', u'name'] """ - return [f.name for f in self.schema().fields] + return [f.name for f in self.schema.fields] def join(self, other, joinExprs=None, joinType=None): """ @@ -586,8 +585,8 @@ def agg(self, *exprs): >>> df.agg({"age": "max"}).collect() [Row(MAX(age#0)=5)] - >>> from pyspark.sql import Dsl - >>> df.agg(Dsl.min(df.age)).collect() + >>> from pyspark.sql import functions as F + >>> df.agg(F.min(df.age)).collect() [Row(MIN(age#0)=2)] """ return self.groupBy().agg(*exprs) @@ -616,18 +615,18 @@ def subtract(self, other): """ return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx) - def addColumn(self, colName, col): + def withColumn(self, colName, col): """ Return a new :class:`DataFrame` by adding a column. - >>> df.addColumn('age2', df.age + 2).collect() + >>> df.withColumn('age2', df.age + 2).collect() [Row(age=2, name=u'Alice', age2=4), Row(age=5, name=u'Bob', age2=7)] """ return self.select('*', col.alias(colName)) - def renameColumn(self, existing, new): + def withColumnRenamed(self, existing, new): """ Rename an existing column to a new name - >>> df.renameColumn('age', 'age2').collect() + >>> df.withColumnRenamed('age', 'age2').collect() [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')] """ cols = [Column(_to_java_column(c), self.sql_ctx).alias(new) @@ -635,11 +634,11 @@ def renameColumn(self, existing, new): for c in self.columns] return self.select(*cols) - def to_pandas(self): + def toPandas(self): """ Collect all the rows and return a `pandas.DataFrame`. - >>> df.to_pandas() # doctest: +SKIP + >>> df.toPandas() # doctest: +SKIP age name 0 2 Alice 1 5 Bob @@ -687,10 +686,11 @@ def agg(self, *exprs): name to aggregate methods. 
>>> gdf = df.groupBy(df.name) - >>> gdf.agg({"age": "max"}).collect() - [Row(name=u'Bob', MAX(age#0)=5), Row(name=u'Alice', MAX(age#0)=2)] - >>> from pyspark.sql import Dsl - >>> gdf.agg(Dsl.min(df.age)).collect() + >>> gdf.agg({"*": "count"}).collect() + [Row(name=u'Bob', COUNT(1)=1), Row(name=u'Alice', COUNT(1)=1)] + + >>> from pyspark.sql import functions as F + >>> gdf.agg(F.min(df.age)).collect() [Row(MIN(age#0)=5), Row(MIN(age#0)=2)] """ assert exprs, "exprs should not be empty" @@ -742,12 +742,12 @@ def sum(self): def _create_column_from_literal(literal): sc = SparkContext._active_spark_context - return sc._jvm.Dsl.lit(literal) + return sc._jvm.functions.lit(literal) def _create_column_from_name(name): sc = SparkContext._active_spark_context - return sc._jvm.Dsl.col(name) + return sc._jvm.functions.col(name) def _to_java_column(col): @@ -767,9 +767,9 @@ def _(self): return _ -def _dsl_op(name, doc=''): +def _func_op(name, doc=''): def _(self): - jc = getattr(self._sc._jvm.Dsl, name)(self._jc) + jc = getattr(self._sc._jvm.functions, name)(self._jc) return Column(jc, self.sql_ctx) _.__doc__ = doc return _ @@ -818,7 +818,7 @@ def __init__(self, jc, sql_ctx=None): super(Column, self).__init__(jc, sql_ctx) # arithmetic operators - __neg__ = _dsl_op("negate") + __neg__ = _func_op("negate") __add__ = _bin_op("plus") __sub__ = _bin_op("minus") __mul__ = _bin_op("multiply") @@ -842,7 +842,7 @@ def __init__(self, jc, sql_ctx=None): # so use bitwise operators as boolean operators __and__ = _bin_op('and') __or__ = _bin_op('or') - __invert__ = _dsl_op('not') + __invert__ = _func_op('not') __rand__ = _bin_op("and") __ror__ = _bin_op("or") @@ -920,11 +920,11 @@ def __repr__(self): else: return 'Column<%s>' % self._jdf.toString() - def to_pandas(self): + def toPandas(self): """ Return a pandas.Series from the column - >>> df.age.to_pandas() # doctest: +SKIP + >>> df.age.toPandas() # doctest: +SKIP 0 2 1 5 dtype: int64 @@ -934,123 +934,6 @@ def to_pandas(self): return pd.Series(data) -def _aggregate_func(name, doc=""): - """ Create a function for aggregator by name""" - def _(col): - sc = SparkContext._active_spark_context - jc = getattr(sc._jvm.Dsl, name)(_to_java_column(col)) - return Column(jc) - _.__name__ = name - _.__doc__ = doc - return staticmethod(_) - - -class UserDefinedFunction(object): - def __init__(self, func, returnType): - self.func = func - self.returnType = returnType - self._broadcast = None - self._judf = self._create_judf() - - def _create_judf(self): - f = self.func # put it in closure `func` - func = lambda _, it: imap(lambda x: f(*x), it) - ser = AutoBatchedSerializer(PickleSerializer()) - command = (func, None, ser, ser) - sc = SparkContext._active_spark_context - pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self) - ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc()) - jdt = ssql_ctx.parseDataType(self.returnType.json()) - judf = sc._jvm.UserDefinedPythonFunction(f.__name__, bytearray(pickled_command), env, - includes, sc.pythonExec, broadcast_vars, - sc._javaAccumulator, jdt) - return judf - - def __del__(self): - if self._broadcast is not None: - self._broadcast.unpersist() - self._broadcast = None - - def __call__(self, *cols): - sc = SparkContext._active_spark_context - jcols = ListConverter().convert([_to_java_column(c) for c in cols], - sc._gateway._gateway_client) - jc = self._judf.apply(sc._jvm.PythonUtils.toSeq(jcols)) - return Column(jc) - - -class Dsl(object): - """ - A collections of builtin aggregators - """ - DSLS = { - 'lit': 
'Creates a :class:`Column` of literal value.', - 'col': 'Returns a :class:`Column` based on the given column name.', - 'column': 'Returns a :class:`Column` based on the given column name.', - 'upper': 'Converts a string expression to upper case.', - 'lower': 'Converts a string expression to upper case.', - 'sqrt': 'Computes the square root of the specified float value.', - 'abs': 'Computes the absolutle value.', - - 'max': 'Aggregate function: returns the maximum value of the expression in a group.', - 'min': 'Aggregate function: returns the minimum value of the expression in a group.', - 'first': 'Aggregate function: returns the first value in a group.', - 'last': 'Aggregate function: returns the last value in a group.', - 'count': 'Aggregate function: returns the number of items in a group.', - 'sum': 'Aggregate function: returns the sum of all values in the expression.', - 'avg': 'Aggregate function: returns the average of the values in a group.', - 'mean': 'Aggregate function: returns the average of the values in a group.', - 'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.', - } - - for _name, _doc in DSLS.items(): - locals()[_name] = _aggregate_func(_name, _doc) - del _name, _doc - - @staticmethod - def countDistinct(col, *cols): - """ Return a new Column for distinct count of (col, *cols) - - >>> from pyspark.sql import Dsl - >>> df.agg(Dsl.countDistinct(df.age, df.name).alias('c')).collect() - [Row(c=2)] - - >>> df.agg(Dsl.countDistinct("age", "name").alias('c')).collect() - [Row(c=2)] - """ - sc = SparkContext._active_spark_context - jcols = ListConverter().convert([_to_java_column(c) for c in cols], - sc._gateway._gateway_client) - jc = sc._jvm.Dsl.countDistinct(_to_java_column(col), - sc._jvm.PythonUtils.toSeq(jcols)) - return Column(jc) - - @staticmethod - def approxCountDistinct(col, rsd=None): - """ Return a new Column for approxiate distinct count of (col, *cols) - - >>> from pyspark.sql import Dsl - >>> df.agg(Dsl.approxCountDistinct(df.age).alias('c')).collect() - [Row(c=2)] - """ - sc = SparkContext._active_spark_context - if rsd is None: - jc = sc._jvm.Dsl.approxCountDistinct(_to_java_column(col)) - else: - jc = sc._jvm.Dsl.approxCountDistinct(_to_java_column(col), rsd) - return Column(jc) - - @staticmethod - def udf(f, returnType=StringType()): - """Create a user defined function (UDF) - - >>> slen = Dsl.udf(lambda s: len(s), IntegerType()) - >>> df.select(slen(df.name).alias('slen')).collect() - [Row(slen=5), Row(slen=3)] - """ - return UserDefinedFunction(f, returnType) - - def _test(): import doctest from pyspark.context import SparkContext @@ -1059,11 +942,9 @@ def _test(): globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc - globs['sqlCtx'] = sqlCtx = SQLContext(sc) - rdd2 = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]) - rdd3 = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]) - globs['df'] = sqlCtx.inferSchema(rdd2) - globs['df2'] = sqlCtx.inferSchema(rdd3) + globs['sqlCtx'] = SQLContext(sc) + globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() + globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py new file mode 100644 index 
0000000000000..39aa550eeb5ad --- /dev/null +++ b/python/pyspark/sql/functions.py @@ -0,0 +1,170 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +A collections of builtin functions +""" + +from itertools import imap + +from py4j.java_collections import ListConverter + +from pyspark import SparkContext +from pyspark.rdd import _prepare_for_python_RDD +from pyspark.serializers import PickleSerializer, AutoBatchedSerializer +from pyspark.sql.types import StringType +from pyspark.sql.dataframe import Column, _to_java_column + + +__all__ = ['countDistinct', 'approxCountDistinct', 'udf'] + + +def _create_function(name, doc=""): + """ Create a function for aggregator by name""" + def _(col): + sc = SparkContext._active_spark_context + jc = getattr(sc._jvm.functions, name)(_to_java_column(col)) + return Column(jc) + _.__name__ = name + _.__doc__ = doc + return _ + + +_functions = { + 'lit': 'Creates a :class:`Column` of literal value.', + 'col': 'Returns a :class:`Column` based on the given column name.', + 'column': 'Returns a :class:`Column` based on the given column name.', + 'upper': 'Converts a string expression to upper case.', + 'lower': 'Converts a string expression to upper case.', + 'sqrt': 'Computes the square root of the specified float value.', + 'abs': 'Computes the absolutle value.', + + 'max': 'Aggregate function: returns the maximum value of the expression in a group.', + 'min': 'Aggregate function: returns the minimum value of the expression in a group.', + 'first': 'Aggregate function: returns the first value in a group.', + 'last': 'Aggregate function: returns the last value in a group.', + 'count': 'Aggregate function: returns the number of items in a group.', + 'sum': 'Aggregate function: returns the sum of all values in the expression.', + 'avg': 'Aggregate function: returns the average of the values in a group.', + 'mean': 'Aggregate function: returns the average of the values in a group.', + 'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.', +} + + +for _name, _doc in _functions.items(): + globals()[_name] = _create_function(_name, _doc) +del _name, _doc +__all__ += _functions.keys() + + +def countDistinct(col, *cols): + """ Return a new Column for distinct count of `col` or `cols` + + >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect() + [Row(c=2)] + + >>> df.agg(countDistinct("age", "name").alias('c')).collect() + [Row(c=2)] + """ + sc = SparkContext._active_spark_context + jcols = ListConverter().convert([_to_java_column(c) for c in cols], sc._gateway._gateway_client) + jc = sc._jvm.functions.countDistinct(_to_java_column(col), sc._jvm.PythonUtils.toSeq(jcols)) + return Column(jc) + + +def approxCountDistinct(col, rsd=None): + """ Return a new Column for approximate distinct 
count of `col` + + >>> df.agg(approxCountDistinct(df.age).alias('c')).collect() + [Row(c=2)] + """ + sc = SparkContext._active_spark_context + if rsd is None: + jc = sc._jvm.functions.approxCountDistinct(_to_java_column(col)) + else: + jc = sc._jvm.functions.approxCountDistinct(_to_java_column(col), rsd) + return Column(jc) + + +class UserDefinedFunction(object): + """ + User defined function in Python + """ + def __init__(self, func, returnType): + self.func = func + self.returnType = returnType + self._broadcast = None + self._judf = self._create_judf() + + def _create_judf(self): + f = self.func # put it in closure `func` + func = lambda _, it: imap(lambda x: f(*x), it) + ser = AutoBatchedSerializer(PickleSerializer()) + command = (func, None, ser, ser) + sc = SparkContext._active_spark_context + pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self) + ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc()) + jdt = ssql_ctx.parseDataType(self.returnType.json()) + judf = sc._jvm.UserDefinedPythonFunction(f.__name__, bytearray(pickled_command), env, + includes, sc.pythonExec, broadcast_vars, + sc._javaAccumulator, jdt) + return judf + + def __del__(self): + if self._broadcast is not None: + self._broadcast.unpersist() + self._broadcast = None + + def __call__(self, *cols): + sc = SparkContext._active_spark_context + jcols = ListConverter().convert([_to_java_column(c) for c in cols], + sc._gateway._gateway_client) + jc = self._judf.apply(sc._jvm.PythonUtils.toSeq(jcols)) + return Column(jc) + + +def udf(f, returnType=StringType()): + """Create a user defined function (UDF) + + >>> slen = udf(lambda s: len(s), IntegerType()) + >>> df.select(slen(df.name).alias('slen')).collect() + [Row(slen=5), Row(slen=3)] + """ + return UserDefinedFunction(f, returnType) + + +def _test(): + import doctest + from pyspark.context import SparkContext + from pyspark.sql import Row, SQLContext + import pyspark.sql.dataframe + globs = pyspark.sql.dataframe.__dict__.copy() + sc = SparkContext('local[4]', 'PythonTest') + globs['sc'] = sc + globs['sqlCtx'] = SQLContext(sc) + globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() + globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() + (failure_count, test_count) = doctest.testmod( + pyspark.sql.dataframe, globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) + globs['sc'].stop() + if failure_count: + exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 43e5c3a1b00fa..aa80bca34655d 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -96,7 +96,7 @@ def setUpClass(cls): cls.sqlCtx = SQLContext(cls.sc) cls.testData = [Row(key=i, value=str(i)) for i in range(100)] rdd = cls.sc.parallelize(cls.testData) - cls.df = cls.sqlCtx.createDataFrame(rdd) + cls.df = rdd.toDF() @classmethod def tearDownClass(cls): @@ -138,7 +138,7 @@ def test_basic_functions(self): df = self.sqlCtx.jsonRDD(rdd) df.count() df.collect() - df.schema() + df.schema # cache and checkpoint self.assertFalse(df.is_cached) @@ -155,11 +155,11 @@ def test_basic_functions(self): def test_apply_schema_to_row(self): df = self.sqlCtx.jsonRDD(self.sc.parallelize(["""{"a":2}"""])) - df2 = self.sqlCtx.createDataFrame(df.map(lambda x: x), df.schema()) + df2 = self.sqlCtx.createDataFrame(df.map(lambda x: x), df.schema) self.assertEqual(df.collect(), df2.collect()) rdd = 
self.sc.parallelize(range(10)).map(lambda x: Row(a=x)) - df3 = self.sqlCtx.createDataFrame(rdd, df.schema()) + df3 = self.sqlCtx.createDataFrame(rdd, df.schema) self.assertEqual(10, df3.count()) def test_serialize_nested_array_and_map(self): @@ -195,7 +195,7 @@ def test_infer_schema(self): self.assertEqual(1, result.head()[0]) df2 = self.sqlCtx.createDataFrame(rdd, samplingRatio=1.0) - self.assertEqual(df.schema(), df2.schema()) + self.assertEqual(df.schema, df2.schema) self.assertEqual({}, df2.map(lambda r: r.d).first()) self.assertEqual([None, ""], df2.map(lambda r: r.s).collect()) df2.registerTempTable("test2") @@ -204,8 +204,7 @@ def test_infer_schema(self): def test_struct_in_map(self): d = [Row(m={Row(i=1): Row(s="")})] - rdd = self.sc.parallelize(d) - df = self.sqlCtx.createDataFrame(rdd) + df = self.sc.parallelize(d).toDF() k, v = df.head().m.items()[0] self.assertEqual(1, k.i) self.assertEqual("", v.s) @@ -213,8 +212,7 @@ def test_struct_in_map(self): def test_convert_row_to_dict(self): row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")}) self.assertEqual(1, row.asDict()['l'][0].a) - rdd = self.sc.parallelize([row]) - df = self.sqlCtx.createDataFrame(rdd) + df = self.sc.parallelize([row]).toDF() df.registerTempTable("test") row = self.sqlCtx.sql("select l, d from test").head() self.assertEqual(1, row.asDict()["l"][0].a) @@ -223,9 +221,8 @@ def test_convert_row_to_dict(self): def test_infer_schema_with_udt(self): from pyspark.sql.tests import ExamplePoint, ExamplePointUDT row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) - rdd = self.sc.parallelize([row]) - df = self.sqlCtx.createDataFrame(rdd) - schema = df.schema() + df = self.sc.parallelize([row]).toDF() + schema = df.schema field = [f for f in schema.fields if f.name == "point"][0] self.assertEqual(type(field.dataType), ExamplePointUDT) df.registerTempTable("labeled_point") @@ -238,15 +235,14 @@ def test_apply_schema_with_udt(self): rdd = self.sc.parallelize([row]) schema = StructType([StructField("label", DoubleType(), False), StructField("point", ExamplePointUDT(), False)]) - df = self.sqlCtx.createDataFrame(rdd, schema) + df = rdd.toDF(schema) point = df.head().point self.assertEquals(point, ExamplePoint(1.0, 2.0)) def test_parquet_with_udt(self): from pyspark.sql.tests import ExamplePoint row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) - rdd = self.sc.parallelize([row]) - df0 = self.sqlCtx.createDataFrame(rdd) + df0 = self.sc.parallelize([row]).toDF() output_dir = os.path.join(self.tempdir.name, "labeled_point") df0.saveAsParquetFile(output_dir) df1 = self.sqlCtx.parquetFile(output_dir) @@ -280,10 +276,11 @@ def test_aggregator(self): self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0])) self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect()) - from pyspark.sql import Dsl - self.assertEqual((0, u'99'), tuple(g.agg(Dsl.first(df.key), Dsl.last(df.value)).first())) - self.assertTrue(95 < g.agg(Dsl.approxCountDistinct(df.key)).first()[0]) - self.assertEqual(100, g.agg(Dsl.countDistinct(df.value)).first()[0]) + from pyspark.sql import functions + self.assertEqual((0, u'99'), + tuple(g.agg(functions.first(df.key), functions.last(df.value)).first())) + self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0]) + self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0]) def test_save_and_load(self): df = self.df @@ -339,8 +336,7 @@ def setUpClass(cls): cls.sc._jvm.org.apache.spark.sql.hive.test.TestHiveContext(cls.sc._jsc.sc()) 
cls.sqlCtx = HiveContext(cls.sc, _scala_HiveContext) cls.testData = [Row(key=i, value=str(i)) for i in range(100)] - rdd = cls.sc.parallelize(cls.testData) - cls.df = cls.sqlCtx.inferSchema(rdd) + cls.df = cls.sc.parallelize(cls.testData).toDF() @classmethod def tearDownClass(cls): diff --git a/python/run-tests b/python/run-tests index 077ad60d764a3..a2c2f37a54eda 100755 --- a/python/run-tests +++ b/python/run-tests @@ -35,7 +35,7 @@ rm -rf metastore warehouse function run_test() { echo "Running test: $1" | tee -a $LOG_FILE - SPARK_TESTING=1 time "$FWDIR"/bin/pyspark $1 >> $LOG_FILE 2>&1 + SPARK_TESTING=1 time "$FWDIR"/bin/pyspark $1 > $LOG_FILE 2>&1 FAILED=$((PIPESTATUS[0]||$FAILED)) @@ -67,6 +67,7 @@ function run_sql_tests() { run_test "pyspark/sql/types.py" run_test "pyspark/sql/context.py" run_test "pyspark/sql/dataframe.py" + run_test "pyspark/sql/functions.py" run_test "pyspark/sql/tests.py" } diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala index 0cf2de6d399b0..05faef8786d2c 100644 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala +++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala @@ -137,7 +137,7 @@ private[repl] trait SparkILoopInit { command("import org.apache.spark.SparkContext._") command("import sqlContext.implicits._") command("import sqlContext.sql") - command("import org.apache.spark.sql.Dsl._") + command("import org.apache.spark.sql.functions._") } } diff --git a/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 201f2672d5474..529914a2b6141 100644 --- a/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/scala-2.10/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -262,7 +262,7 @@ class ReplSuite extends FunSuite { |val sqlContext = new org.apache.spark.sql.SQLContext(sc) |import sqlContext.implicits._ |case class TestCaseClass(value: Int) - |sc.parallelize(1 to 10).map(x => TestCaseClass(x)).toDataFrame.collect() + |sc.parallelize(1 to 10).map(x => TestCaseClass(x)).toDF.collect() """.stripMargin) assertDoesNotContain("error:", output) assertDoesNotContain("Exception", output) diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala index 1bd2a6991404b..7a5e94da5cbf3 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala @@ -77,7 +77,7 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter) command("import org.apache.spark.SparkContext._") command("import sqlContext.implicits._") command("import sqlContext.sql") - command("import org.apache.spark.sql.Dsl._") + command("import org.apache.spark.sql.functions._") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index f959a50564011..a7cd4124e56f3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -152,7 +152,7 @@ case class MultiAlias(child: Expression, names: Seq[String]) override lazy val resolved = false - override def newInstance 
= this + override def newInstance() = this override def withNullability(newNullability: Boolean) = this diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 9d5d6e78bd487..f6ecee1af8aad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -17,12 +17,11 @@ package org.apache.spark.sql -import scala.annotation.tailrec import scala.language.implicitConversions -import org.apache.spark.sql.Dsl.lit +import org.apache.spark.sql.functions.lit import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.logical.{Subquery, Project, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} import org.apache.spark.sql.catalyst.analysis.UnresolvedGetField import org.apache.spark.sql.types._ @@ -127,7 +126,7 @@ trait Column extends DataFrame { * df.select( -df("amount") ) * * // Java: - * import static org.apache.spark.sql.Dsl.*; + * import static org.apache.spark.sql.functions.*; * df.select( negate(col("amount") ); * }}} */ @@ -140,7 +139,7 @@ trait Column extends DataFrame { * df.filter( !df("isActive") ) * * // Java: - * import static org.apache.spark.sql.Dsl.*; + * import static org.apache.spark.sql.functions.*; * df.filter( not(df.col("isActive")) ); * }} */ @@ -153,7 +152,7 @@ trait Column extends DataFrame { * df.filter( df("colA") === df("colB") ) * * // Java - * import static org.apache.spark.sql.Dsl.*; + * import static org.apache.spark.sql.functions.*; * df.filter( col("colA").equalTo(col("colB")) ); * }}} */ @@ -168,7 +167,7 @@ trait Column extends DataFrame { * df.filter( df("colA") === df("colB") ) * * // Java - * import static org.apache.spark.sql.Dsl.*; + * import static org.apache.spark.sql.functions.*; * df.filter( col("colA").equalTo(col("colB")) ); * }}} */ @@ -182,7 +181,7 @@ trait Column extends DataFrame { * df.select( !(df("colA") === df("colB")) ) * * // Java: - * import static org.apache.spark.sql.Dsl.*; + * import static org.apache.spark.sql.functions.*; * df.filter( col("colA").notEqual(col("colB")) ); * }}} */ @@ -198,7 +197,7 @@ trait Column extends DataFrame { * df.select( !(df("colA") === df("colB")) ) * * // Java: - * import static org.apache.spark.sql.Dsl.*; + * import static org.apache.spark.sql.functions.*; * df.filter( col("colA").notEqual(col("colB")) ); * }}} */ @@ -213,7 +212,7 @@ trait Column extends DataFrame { * people.select( people("age") > 21 ) * * // Java: - * import static org.apache.spark.sql.Dsl.*; + * import static org.apache.spark.sql.functions.*; * people.select( people("age").gt(21) ); * }}} */ @@ -228,7 +227,7 @@ trait Column extends DataFrame { * people.select( people("age") > lit(21) ) * * // Java: - * import static org.apache.spark.sql.Dsl.*; + * import static org.apache.spark.sql.functions.*; * people.select( people("age").gt(21) ); * }}} */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 4f8f19e2c1465..e21e989f36c65 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -48,7 +48,7 @@ private[sql] object DataFrame { * }}} * * Once created, it can be manipulated using the various domain-specific-language (DSL) functions - * defined in: [[DataFrame]] (this class), [[Column]], [[Dsl]] for the DSL. 
+ * defined in: [[DataFrame]] (this class), [[Column]], [[functions]] for the DSL. * * To select a column from the data frame, use the apply method: * {{{ @@ -94,27 +94,27 @@ trait DataFrame extends RDDApi[Row] with Serializable { } /** Left here for backward compatibility. */ - @deprecated("1.3.0", "use toDataFrame") + @deprecated("1.3.0", "use toDF") def toSchemaRDD: DataFrame = this /** * Returns the object itself. Used to force an implicit conversion from RDD to DataFrame in Scala. */ // This is declared with parentheses to prevent the Scala compiler from treating - // `rdd.toDataFrame("1")` as invoking this toDataFrame and then apply on the returned DataFrame. - def toDataFrame(): DataFrame = this + // `rdd.toDF("1")` as invoking this toDF and then apply on the returned DataFrame. + def toDF(): DataFrame = this /** * Returns a new [[DataFrame]] with columns renamed. This can be quite convenient in conversion * from a RDD of tuples into a [[DataFrame]] with meaningful names. For example: * {{{ * val rdd: RDD[(Int, String)] = ... - * rdd.toDataFrame // this implicit conversion creates a DataFrame with column name _1 and _2 - * rdd.toDataFrame("id", "name") // this creates a DataFrame with column name "id" and "name" + * rdd.toDF // this implicit conversion creates a DataFrame with column name _1 and _2 + * rdd.toDF("id", "name") // this creates a DataFrame with column name "id" and "name" * }}} */ @scala.annotation.varargs - def toDataFrame(colNames: String*): DataFrame + def toDF(colNames: String*): DataFrame /** Returns the schema of this [[DataFrame]]. */ def schema: StructType @@ -132,7 +132,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { def explain(extended: Boolean): Unit /** Only prints the physical plan to the console for debugging purpose. */ - def explain(): Unit = explain(false) + def explain(): Unit = explain(extended = false) /** * Returns true if the `collect` and `take` methods can be run locally @@ -179,11 +179,11 @@ trait DataFrame extends RDDApi[Row] with Serializable { * * {{{ * // Scala: - * import org.apache.spark.sql.dsl._ + * import org.apache.spark.sql.functions._ * df1.join(df2, "outer", $"df1Key" === $"df2Key") * * // Java: - * import static org.apache.spark.sql.Dsl.*; + * import static org.apache.spark.sql.functions.*; * df1.join(df2, "outer", col("df1Key") === col("df2Key")); * }}} * @@ -483,12 +483,12 @@ trait DataFrame extends RDDApi[Row] with Serializable { /** * Returns a new [[DataFrame]] by adding a column. */ - def addColumn(colName: String, col: Column): DataFrame + def withColumn(colName: String, col: Column): DataFrame /** * Returns a new [[DataFrame]] with a column renamed. */ - def renameColumn(existingName: String, newName: String): DataFrame + def withColumnRenamed(existingName: String, newName: String): DataFrame /** * Returns the first `n` rows. @@ -520,6 +520,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * Returns a new RDD by applying a function to each partition of this DataFrame. */ override def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] + /** * Applies a function `f` to all rows. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameHolder.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameHolder.scala new file mode 100644 index 0000000000000..a3187fe3230fd --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameHolder.scala @@ -0,0 +1,30 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql + +/** + * A container for a [[DataFrame]], used for implicit conversions. + */ +private[sql] case class DataFrameHolder(df: DataFrame) { + + // This is declared with parentheses to prevent the Scala compiler from treating + // `rdd.toDF("1")` as invoking this toDF and then apply on the returned DataFrame. + def toDF(): DataFrame = df + + def toDF(colNames: String*): DataFrame = df.toDF(colNames :_*) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index bb5c6226a2217..7b7efbe3477b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -94,7 +94,7 @@ private[sql] class DataFrameImpl protected[sql]( } } - override def toDataFrame(colNames: String*): DataFrame = { + override def toDF(colNames: String*): DataFrame = { require(schema.size == colNames.size, "The number of columns doesn't match.\n" + "Old column names: " + schema.fields.map(_.name).mkString(", ") + "\n" + @@ -229,11 +229,11 @@ private[sql] class DataFrameImpl protected[sql]( }: _*) } - override def addColumn(colName: String, col: Column): DataFrame = { + override def withColumn(colName: String, col: Column): DataFrame = { select(Column("*"), col.as(colName)) } - override def renameColumn(existingName: String, newName: String): DataFrame = { + override def withColumnRenamed(existingName: String, newName: String): DataFrame = { val colNames = schema.map { field => val name = field.name if (name == existingName) Column(name).as(newName) else Column(name) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala index 3c20676355c9d..0868013fe7c96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql import scala.language.implicitConversions import scala.collection.JavaConversions._ +import org.apache.spark.sql.catalyst.analysis.Star import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.{Literal => LiteralExpr} import org.apache.spark.sql.catalyst.plans.logical.Aggregate @@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Aggregate */ class 
GroupedData protected[sql](df: DataFrameImpl, groupingExprs: Seq[Expression]) { - private[this] implicit def toDataFrame(aggExprs: Seq[NamedExpression]): DataFrame = { + private[this] implicit def toDF(aggExprs: Seq[NamedExpression]): DataFrame = { val namedGroupingExprs = groupingExprs.map { case expr: NamedExpression => expr case expr: Expression => Alias(expr, expr.toString)() @@ -52,7 +52,12 @@ class GroupedData protected[sql](df: DataFrameImpl, groupingExprs: Seq[Expressio case "max" => Max case "min" => Min case "sum" => Sum - case "count" | "size" => Count + case "count" | "size" => + // Turn count(*) into count(1) + (inputExpr: Expression) => inputExpr match { + case s: Star => Count(Literal(1)) + case _ => Count(inputExpr) + } } } @@ -115,17 +120,17 @@ class GroupedData protected[sql](df: DataFrameImpl, groupingExprs: Seq[Expressio * Compute aggregates by specifying a series of aggregate columns. Unlike other methods in this * class, the resulting [[DataFrame]] won't automatically include the grouping columns. * - * The available aggregate methods are defined in [[org.apache.spark.sql.Dsl]]. + * The available aggregate methods are defined in [[org.apache.spark.sql.functions]]. * * {{{ * // Selects the age of the oldest employee and the aggregate expense for each department * * // Scala: - * import org.apache.spark.sql.dsl._ + * import org.apache.spark.sql.functions._ * df.groupBy("department").agg($"department", max($"age"), sum($"expense")) * * // Java: - * import static org.apache.spark.sql.Dsl.*; + * import static org.apache.spark.sql.functions.*; * df.groupBy("department").agg(col("department"), max(col("age")), sum(col("expense"))); * }}} */ @@ -142,7 +147,7 @@ class GroupedData protected[sql](df: DataFrameImpl, groupingExprs: Seq[Expressio * Count the number of rows for each group. * The resulting [[DataFrame]] will also contain the grouping columns. */ - def count(): DataFrame = Seq(Alias(Count(LiteralExpr(1)), "count")()) + def count(): DataFrame = Seq(Alias(Count(Literal(1)), "count")()) /** * Compute the average value for each numeric columns for each group. This is an alias for `avg`. 
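Taken together, the renames above (toDF, withColumn, withColumnRenamed) and the count(*) handling change how a typical DataFrame pipeline is spelled. A minimal sketch of the new spellings, assuming an existing SparkContext `sc`, a SQLContext named `sqlContext`, and made-up sample data and column names:

    import org.apache.spark.sql.functions._
    import sqlContext.implicits._

    val df = sc.parallelize(Seq(("Alice", 2), ("Bob", 5))).toDF("name", "age")

    val df2 = df
      .withColumn("agePlusOne", df("age") + lit(1))   // formerly addColumn
      .withColumnRenamed("name", "firstName")         // formerly renameColumn

    // count("*") is now rewritten to count(1), so these two aggregations agree
    df.groupBy("name").agg(count("*")).collect()
    df.groupBy("name").agg(Map("*" -> "count")).collect()
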
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala index cba3b77011cc3..fc37cfa7a899f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala @@ -50,7 +50,7 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten protected[sql] override def logicalPlan: LogicalPlan = err() - override def toDataFrame(colNames: String*): DataFrame = err() + override def toDF(colNames: String*): DataFrame = err() override def schema: StructType = err() @@ -86,9 +86,9 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten override def selectExpr(exprs: String*): DataFrame = err() - override def addColumn(colName: String, col: Column): DataFrame = err() + override def withColumn(colName: String, col: Column): DataFrame = err() - override def renameColumn(existingName: String, newName: String): DataFrame = err() + override def withColumnRenamed(existingName: String, newName: String): DataFrame = err() override def filter(condition: Column): DataFrame = err() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 2165949d32c6f..a1736d0277f1c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -183,14 +183,25 @@ class SQLContext(@transient val sparkContext: SparkContext) object implicits extends Serializable { // scalastyle:on + /** Converts $"col name" into an [[Column]]. */ + implicit class StringToColumn(val sc: StringContext) { + def $(args: Any*): ColumnName = { + new ColumnName(sc.s(args :_*)) + } + } + + /** An implicit conversion that turns a Scala `Symbol` into a [[Column]]. */ + implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) + /** Creates a DataFrame from an RDD of case classes or tuples. */ - implicit def rddToDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = { - self.createDataFrame(rdd) + implicit def rddToDataFrameHolder[A <: Product : TypeTag](rdd: RDD[A]): DataFrameHolder = { + DataFrameHolder(self.createDataFrame(rdd)) } /** Creates a DataFrame from a local Seq of Product. */ - implicit def localSeqToDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = { - self.createDataFrame(data) + implicit def localSeqToDataFrameHolder[A <: Product : TypeTag](data: Seq[A]): DataFrameHolder = + { + DataFrameHolder(self.createDataFrame(data)) } // Do NOT add more implicit conversions. They are likely to break source compatibility by @@ -198,7 +209,7 @@ class SQLContext(@transient val sparkContext: SparkContext) // because of [[DoubleRDDFunctions]]. /** Creates a single column DataFrame from an RDD[Int]. */ - implicit def intRddToDataFrame(data: RDD[Int]): DataFrame = { + implicit def intRddToDataFrameHolder(data: RDD[Int]): DataFrameHolder = { val dataType = IntegerType val rows = data.mapPartitions { iter => val row = new SpecificMutableRow(dataType :: Nil) @@ -207,11 +218,11 @@ class SQLContext(@transient val sparkContext: SparkContext) row: Row } } - self.createDataFrame(rows, StructType(StructField("_1", dataType) :: Nil)) + DataFrameHolder(self.createDataFrame(rows, StructType(StructField("_1", dataType) :: Nil))) } /** Creates a single column DataFrame from an RDD[Long]. 
*/ - implicit def longRddToDataFrame(data: RDD[Long]): DataFrame = { + implicit def longRddToDataFrameHolder(data: RDD[Long]): DataFrameHolder = { val dataType = LongType val rows = data.mapPartitions { iter => val row = new SpecificMutableRow(dataType :: Nil) @@ -220,11 +231,11 @@ class SQLContext(@transient val sparkContext: SparkContext) row: Row } } - self.createDataFrame(rows, StructType(StructField("_1", dataType) :: Nil)) + DataFrameHolder(self.createDataFrame(rows, StructType(StructField("_1", dataType) :: Nil))) } /** Creates a single column DataFrame from an RDD[String]. */ - implicit def stringRddToDataFrame(data: RDD[String]): DataFrame = { + implicit def stringRddToDataFrame(data: RDD[String]): DataFrameHolder = { val dataType = StringType val rows = data.mapPartitions { iter => val row = new SpecificMutableRow(dataType :: Nil) @@ -233,7 +244,7 @@ class SQLContext(@transient val sparkContext: SparkContext) row: Row } } - self.createDataFrame(rows, StructType(StructField("_1", dataType) :: Nil)) + DataFrameHolder(self.createDataFrame(rows, StructType(StructField("_1", dataType) :: Nil))) } } @@ -780,7 +791,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * indicating if a table is a temporary one or not). */ def tables(): DataFrame = { - createDataFrame(catalog.getTables(None)).toDataFrame("tableName", "isTemporary") + createDataFrame(catalog.getTables(None)).toDF("tableName", "isTemporary") } /** @@ -789,7 +800,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * indicating if a table is a temporary one or not). */ def tables(databaseName: String): DataFrame = { - createDataFrame(catalog.getTables(Some(databaseName))).toDataFrame("tableName", "isTemporary") + createDataFrame(catalog.getTables(Some(databaseName))).toDF("tableName", "isTemporary") } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala index c60d4070942a9..ee94a5fdbe376 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.execution.PythonUDF import org.apache.spark.sql.types.DataType /** - * A user-defined function. To create one, use the `udf` functions in [[Dsl]]. + * A user-defined function. To create one, use the `udf` functions in [[functions]]. * As an example: * {{{ * // Defined a UDF that returns true or false based on some numeric score. @@ -45,7 +45,7 @@ case class UserDefinedFunction(f: AnyRef, dataType: DataType) { } /** - * A user-defined Python function. To create one, use the `pythonUDF` functions in [[Dsl]]. + * A user-defined Python function. To create one, use the `pythonUDF` functions in [[functions]]. * This is used by Python API. 
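For reference, the `udf` helper mentioned in the comment above moves with the rename into org.apache.spark.sql.functions. A small hypothetical usage, assuming a DataFrame `df` that has a numeric "score" column:

    import org.apache.spark.sql.functions.udf

    // Hypothetical predicate UDF over the assumed "score" column.
    val predict = udf((score: Double) => score > 0.5)
    df.select(predict(df("score"))).collect()
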
*/ private[sql] case class UserDefinedPythonFunction( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala similarity index 97% rename from sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala rename to sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 7bc7683576b71..4a0ec0b72ce81 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dsl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -21,6 +21,7 @@ import scala.language.implicitConversions import scala.reflect.runtime.universe.{TypeTag, typeTag} import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.catalyst.analysis.Star import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ @@ -28,17 +29,9 @@ import org.apache.spark.sql.types._ /** * Domain specific functions available for [[DataFrame]]. */ -object Dsl { - - /** An implicit conversion that turns a Scala `Symbol` into a [[Column]]. */ - implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) - - /** Converts $"col name" into an [[Column]]. */ - implicit class StringToColumn(val sc: StringContext) extends AnyVal { - def $(args: Any*): ColumnName = { - new ColumnName(sc.s(args :_*)) - } - } +// scalastyle:off +object functions { +// scalastyle:on private[this] implicit def toColumn(expr: Expression): Column = Column(expr) @@ -104,7 +97,11 @@ object Dsl { def sumDistinct(columnName: String): Column = sumDistinct(Column(columnName)) /** Aggregate function: returns the number of items in a group. */ - def count(e: Column): Column = Count(e.expr) + def count(e: Column): Column = e.expr match { + // Turn count(*) into count(1) + case s: Star => Count(Literal(1)) + case _ => Count(e.expr) + } /** Aggregate function: returns the number of items in a group. */ def count(columnName: String): Column = count(Column(columnName)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala index 8d3e094e3344d..538d774eb97eb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -90,7 +90,7 @@ trait ParquetTest { (f: String => Unit): Unit = { import sqlContext.implicits._ withTempPath { file => - sparkContext.parallelize(data).saveAsParquetFile(file.getCanonicalPath) + sparkContext.parallelize(data).toDF().saveAsParquetFile(file.getCanonicalPath) f(file.getCanonicalPath) } } diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java index 639436368c4a3..05233dc5ffc58 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java +++ b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java @@ -23,7 +23,7 @@ import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.types.DataTypes; -import static org.apache.spark.sql.Dsl.*; +import static org.apache.spark.sql.functions.*; /** * This test doesn't actually run anything. It is here to check the API compatibility for Java. 
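For user code, the Dsl-to-functions rename boils down to a one-line import change, while the column implicits that used to ride along with Dsl._ now come from sqlContext.implicits._ instead. A rough sketch, assuming a SQLContext named `sqlContext` and a DataFrame `people` with "name" and "age" columns:

    // was: import org.apache.spark.sql.Dsl._
    import org.apache.spark.sql.functions._
    // $"..." string interpolation and Scala Symbol columns now come from the SQLContext
    import sqlContext.implicits._

    people.filter(people("age") > lit(21)).select(upper($"name"), 'age).collect()
    people.agg(min($"age"), countDistinct($"name")).collect()
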
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 1318750a4a3b0..691dae0a0561b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -25,8 +25,9 @@ import org.scalatest.concurrent.Eventually._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.columnar._ -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.TestSQLContext._ +import org.apache.spark.sql.test.TestSQLContext.implicits._ import org.apache.spark.storage.{StorageLevel, RDDBlockId} case class BigData(s: String) @@ -34,8 +35,6 @@ case class BigData(s: String) class CachedTableSuite extends QueryTest { TestData // Load test tables. - import org.apache.spark.sql.test.TestSQLContext.implicits._ - def rddIdOf(tableName: String): Int = { val executedPlan = table(tableName).queryExecution.executedPlan executedPlan.collect { @@ -95,7 +94,7 @@ class CachedTableSuite extends QueryTest { test("too big for memory") { val data = "*" * 10000 - sparkContext.parallelize(1 to 200000, 1).map(_ => BigData(data)).registerTempTable("bigData") + sparkContext.parallelize(1 to 200000, 1).map(_ => BigData(data)).toDF().registerTempTable("bigData") table("bigData").persist(StorageLevel.MEMORY_AND_DISK) assert(table("bigData").count() === 200000L) table("bigData").unpersist(blocking = true) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index e3e6f652ed3ed..a63d733ece627 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext.implicits._ import org.apache.spark.sql.types.{BooleanType, IntegerType, StructField, StructType} @@ -68,7 +68,7 @@ class ColumnExpressionSuite extends QueryTest { } test("collect on column produced by a binary operator") { - val df = Seq((1, 2, 3)).toDataFrame("a", "b", "c") + val df = Seq((1, 2, 3)).toDF("a", "b", "c") checkAnswer(df("a") + df("b"), Seq(Row(3))) checkAnswer(df("a") + df("b").as("c"), Seq(Row(3))) } @@ -79,7 +79,7 @@ class ColumnExpressionSuite extends QueryTest { test("star qualified by data frame object") { // This is not yet supported. 
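Most of the test churn from here on follows a single pattern: the implicit conversion now yields a DataFrameHolder rather than a DataFrame, so an RDD of case classes has to be converted explicitly with toDF() before it can be registered as a table. A sketch of that pattern, with a hypothetical Record case class and table name:

    import sqlContext.implicits._

    case class Record(key: Int, value: String)

    // Previously the RDD was registered directly through an implicit RDD => DataFrame
    // conversion; now the conversion stops at DataFrameHolder and toDF() is explicit.
    sc.parallelize(1 to 10).map(i => Record(i, i.toString))
      .toDF()
      .registerTempTable("records")
    sqlContext.sql("SELECT count(*) FROM records").collect()
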
- val df = testData.toDataFrame + val df = testData.toDF val goldAnswer = df.collect().toSeq checkAnswer(df.select(df("*")), goldAnswer) @@ -156,13 +156,13 @@ class ColumnExpressionSuite extends QueryTest { test("isNull") { checkAnswer( - nullStrings.toDataFrame.where($"s".isNull), + nullStrings.toDF.where($"s".isNull), nullStrings.collect().toSeq.filter(r => r.getString(1) eq null)) } test("isNotNull") { checkAnswer( - nullStrings.toDataFrame.where($"s".isNotNull), + nullStrings.toDF.where($"s".isNotNull), nullStrings.collect().toSeq.filter(r => r.getString(1) ne null)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala index 8fa830dd9390f..2d2367d6e7292 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameImplicitsSuite.scala @@ -25,31 +25,31 @@ class DataFrameImplicitsSuite extends QueryTest { test("RDD of tuples") { checkAnswer( - sc.parallelize(1 to 10).map(i => (i, i.toString)).toDataFrame("intCol", "strCol"), + sc.parallelize(1 to 10).map(i => (i, i.toString)).toDF("intCol", "strCol"), (1 to 10).map(i => Row(i, i.toString))) } test("Seq of tuples") { checkAnswer( - (1 to 10).map(i => (i, i.toString)).toDataFrame("intCol", "strCol"), + (1 to 10).map(i => (i, i.toString)).toDF("intCol", "strCol"), (1 to 10).map(i => Row(i, i.toString))) } test("RDD[Int]") { checkAnswer( - sc.parallelize(1 to 10).toDataFrame("intCol"), + sc.parallelize(1 to 10).toDF("intCol"), (1 to 10).map(i => Row(i))) } test("RDD[Long]") { checkAnswer( - sc.parallelize(1L to 10L).toDataFrame("longCol"), + sc.parallelize(1L to 10L).toDF("longCol"), (1L to 10L).map(i => Row(i))) } test("RDD[String]") { checkAnswer( - sc.parallelize(1 to 10).map(_.toString).toDataFrame("stringCol"), + sc.parallelize(1 to 10).map(_.toString).toDF("stringCol"), (1 to 10).map(i => Row(i.toString))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 33b35f376b270..f0cd43632ec3f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.TestData._ import scala.language.postfixOps -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext.logicalPlanToSparkQuery @@ -99,7 +99,7 @@ class DataFrameSuite extends QueryTest { } test("simple explode") { - val df = Seq(Tuple1("a b c"), Tuple1("d e")).toDataFrame("words") + val df = Seq(Tuple1("a b c"), Tuple1("d e")).toDF("words") checkAnswer( df.explode("words", "word") { word: String => word.split(" ").toSeq }.select('word), @@ -108,7 +108,7 @@ class DataFrameSuite extends QueryTest { } test("explode") { - val df = Seq((1, "a b c"), (2, "a b"), (3, "a")).toDataFrame("number", "letters") + val df = Seq((1, "a b c"), (2, "a b"), (3, "a")).toDF("number", "letters") val df2 = df.explode('letters) { case Row(letters: String) => letters.split(" ").map(Tuple1(_)).toSeq @@ -141,15 +141,30 @@ class DataFrameSuite extends QueryTest { testData.select('key).collect().toSeq) } - test("agg") { + test("groupBy") { checkAnswer( testData2.groupBy("a").agg($"a", sum($"b")), - Seq(Row(1,3), Row(2,3), Row(3,3)) + Seq(Row(1, 
3), Row(2, 3), Row(3, 3)) ) checkAnswer( testData2.groupBy("a").agg($"a", sum($"b").as("totB")).agg(sum('totB)), Row(9) ) + checkAnswer( + testData2.groupBy("a").agg(col("a"), count("*")), + Row(1, 2) :: Row(2, 2) :: Row(3, 2) :: Nil + ) + checkAnswer( + testData2.groupBy("a").agg(Map("*" -> "count")), + Row(1, 2) :: Row(2, 2) :: Row(3, 2) :: Nil + ) + checkAnswer( + testData2.groupBy("a").agg(Map("b" -> "sum")), + Row(1, 3) :: Row(2, 3) :: Row(3, 3) :: Nil + ) + } + + test("agg without groups") { checkAnswer( testData2.agg(sum('b)), Row(9) @@ -218,20 +233,20 @@ class DataFrameSuite extends QueryTest { Seq(Row(3,1), Row(3,2), Row(2,1), Row(2,2), Row(1,1), Row(1,2))) checkAnswer( - arrayData.orderBy('data.getItem(0).asc), - arrayData.toDataFrame.collect().sortBy(_.getAs[Seq[Int]](0)(0)).toSeq) + arrayData.toDF.orderBy('data.getItem(0).asc), + arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(0)).toSeq) checkAnswer( - arrayData.orderBy('data.getItem(0).desc), - arrayData.toDataFrame.collect().sortBy(_.getAs[Seq[Int]](0)(0)).reverse.toSeq) + arrayData.toDF.orderBy('data.getItem(0).desc), + arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(0)).reverse.toSeq) checkAnswer( - arrayData.orderBy('data.getItem(1).asc), - arrayData.toDataFrame.collect().sortBy(_.getAs[Seq[Int]](0)(1)).toSeq) + arrayData.toDF.orderBy('data.getItem(1).asc), + arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(1)).toSeq) checkAnswer( - arrayData.orderBy('data.getItem(1).desc), - arrayData.toDataFrame.collect().sortBy(_.getAs[Seq[Int]](0)(1)).reverse.toSeq) + arrayData.toDF.orderBy('data.getItem(1).desc), + arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(1)).reverse.toSeq) } test("limit") { @@ -240,11 +255,11 @@ class DataFrameSuite extends QueryTest { testData.take(10).toSeq) checkAnswer( - arrayData.limit(1), + arrayData.toDF.limit(1), arrayData.take(1).map(r => Row.fromSeq(r.productIterator.toSeq))) checkAnswer( - mapData.limit(1), + mapData.toDF.limit(1), mapData.take(1).map(r => Row.fromSeq(r.productIterator.toSeq))) } @@ -378,7 +393,7 @@ class DataFrameSuite extends QueryTest { } test("addColumn") { - val df = testData.toDataFrame.addColumn("newCol", col("key") + 1) + val df = testData.toDF.withColumn("newCol", col("key") + 1) checkAnswer( df, testData.collect().map { case Row(key: Int, value: String) => @@ -388,8 +403,8 @@ class DataFrameSuite extends QueryTest { } test("renameColumn") { - val df = testData.toDataFrame.addColumn("newCol", col("key") + 1) - .renameColumn("value", "valueRenamed") + val df = testData.toDF.withColumn("newCol", col("key") + 1) + .withColumnRenamed("value", "valueRenamed") checkAnswer( df, testData.collect().map { case Row(key: Int, value: String) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index f0c939dbb195f..fd73065c4ada3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -20,10 +20,11 @@ package org.apache.spark.sql import org.scalatest.BeforeAndAfterEach import org.apache.spark.sql.TestData._ -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.test.TestSQLContext._ +import org.apache.spark.sql.test.TestSQLContext.implicits._ class JoinSuite extends QueryTest with BeforeAndAfterEach { diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala index 5fc35349e166e..282b98a987dd4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala @@ -28,7 +28,7 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter { import org.apache.spark.sql.test.TestSQLContext.implicits._ val df = - sparkContext.parallelize((1 to 10).map(i => (i,s"str$i"))).toDataFrame("key", "value") + sparkContext.parallelize((1 to 10).map(i => (i,s"str$i"))).toDF("key", "value") before { df.registerTempTable("ListTablesSuiteTable") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index a1c8cf58f2357..97684f75e79fe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import org.apache.spark.sql.test.TestSQLContext import org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types._ @@ -1034,10 +1034,10 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { test("Supporting relational operator '<=>' in Spark SQL") { val nullCheckData1 = TestData(1,"1") :: TestData(2,null) :: Nil val rdd1 = sparkContext.parallelize((0 to 1).map(i => nullCheckData1(i))) - rdd1.registerTempTable("nulldata1") + rdd1.toDF.registerTempTable("nulldata1") val nullCheckData2 = TestData(1,"1") :: TestData(2,null) :: Nil val rdd2 = sparkContext.parallelize((0 to 1).map(i => nullCheckData2(i))) - rdd2.registerTempTable("nulldata2") + rdd2.toDF.registerTempTable("nulldata2") checkAnswer(sql("SELECT nulldata1.key FROM nulldata1 join " + "nulldata2 on nulldata1.value <=> nulldata2.value"), (1 to 2).map(i => Row(i))) @@ -1046,7 +1046,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { test("Multi-column COUNT(DISTINCT ...)") { val data = TestData(1,"val_1") :: TestData(2,"val_2") :: Nil val rdd = sparkContext.parallelize((0 to 1).map(i => data(i))) - rdd.registerTempTable("distinctData") + rdd.toDF.registerTempTable("distinctData") checkAnswer(sql("SELECT COUNT(DISTINCT key,value) FROM distinctData"), Row(2)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala index 93782619826f0..9a48f8d0634cb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala @@ -82,7 +82,7 @@ class ScalaReflectionRelationSuite extends FunSuite { val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, new java.math.BigDecimal(1), new Date(12345), new Timestamp(12345), Seq(1,2,3)) val rdd = sparkContext.parallelize(data :: Nil) - rdd.registerTempTable("reflectData") + rdd.toDF.registerTempTable("reflectData") assert(sql("SELECT * FROM reflectData").collect().head === Row("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, @@ -93,7 +93,7 @@ class ScalaReflectionRelationSuite extends FunSuite { test("query case class RDD with nulls") { val data = 
NullReflectData(null, null, null, null, null, null, null) val rdd = sparkContext.parallelize(data :: Nil) - rdd.registerTempTable("reflectNullData") + rdd.toDF.registerTempTable("reflectNullData") assert(sql("SELECT * FROM reflectNullData").collect().head === Row.fromSeq(Seq.fill(7)(null))) } @@ -101,7 +101,7 @@ class ScalaReflectionRelationSuite extends FunSuite { test("query case class RDD with Nones") { val data = OptionalReflectData(None, None, None, None, None, None, None) val rdd = sparkContext.parallelize(data :: Nil) - rdd.registerTempTable("reflectOptionalData") + rdd.toDF.registerTempTable("reflectOptionalData") assert(sql("SELECT * FROM reflectOptionalData").collect().head === Row.fromSeq(Seq.fill(7)(null))) } @@ -109,7 +109,7 @@ class ScalaReflectionRelationSuite extends FunSuite { // Equality is broken for Arrays, so we test that separately. test("query binary data") { val rdd = sparkContext.parallelize(ReflectBinary(Array[Byte](1)) :: Nil) - rdd.registerTempTable("reflectBinary") + rdd.toDF.registerTempTable("reflectBinary") val result = sql("SELECT data FROM reflectBinary").collect().head(0).asInstanceOf[Array[Byte]] assert(result.toSeq === Seq[Byte](1)) @@ -128,7 +128,7 @@ class ScalaReflectionRelationSuite extends FunSuite { Map(10 -> Some(100L), 20 -> Some(200L), 30 -> None), Nested(None, "abc"))) val rdd = sparkContext.parallelize(data :: Nil) - rdd.registerTempTable("reflectComplexData") + rdd.toDF.registerTempTable("reflectComplexData") assert(sql("SELECT * FROM reflectComplexData").collect().head === new GenericRow(Array[Any]( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala index 0ed437edd05fd..c511eb1469167 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import java.sql.Timestamp import org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.test._ import org.apache.spark.sql.test.TestSQLContext.implicits._ @@ -29,11 +29,11 @@ case class TestData(key: Int, value: String) object TestData { val testData = TestSQLContext.sparkContext.parallelize( - (1 to 100).map(i => TestData(i, i.toString))).toDataFrame + (1 to 100).map(i => TestData(i, i.toString))).toDF testData.registerTempTable("testData") val negativeData = TestSQLContext.sparkContext.parallelize( - (1 to 100).map(i => TestData(-i, (-i).toString))).toDataFrame + (1 to 100).map(i => TestData(-i, (-i).toString))).toDF negativeData.registerTempTable("negativeData") case class LargeAndSmallInts(a: Int, b: Int) @@ -44,7 +44,7 @@ object TestData { LargeAndSmallInts(2147483645, 1) :: LargeAndSmallInts(2, 2) :: LargeAndSmallInts(2147483646, 1) :: - LargeAndSmallInts(3, 2) :: Nil).toDataFrame + LargeAndSmallInts(3, 2) :: Nil).toDF largeAndSmallInts.registerTempTable("largeAndSmallInts") case class TestData2(a: Int, b: Int) @@ -55,7 +55,7 @@ object TestData { TestData2(2, 1) :: TestData2(2, 2) :: TestData2(3, 1) :: - TestData2(3, 2) :: Nil, 2).toDataFrame + TestData2(3, 2) :: Nil, 2).toDF testData2.registerTempTable("testData2") case class DecimalData(a: BigDecimal, b: BigDecimal) @@ -67,7 +67,7 @@ object TestData { DecimalData(2, 1) :: DecimalData(2, 2) :: DecimalData(3, 1) :: - DecimalData(3, 2) :: Nil).toDataFrame + DecimalData(3, 2) :: Nil).toDF decimalData.registerTempTable("decimalData") case class 
BinaryData(a: Array[Byte], b: Int) @@ -77,14 +77,14 @@ object TestData { BinaryData("22".getBytes(), 5) :: BinaryData("122".getBytes(), 3) :: BinaryData("121".getBytes(), 2) :: - BinaryData("123".getBytes(), 4) :: Nil).toDataFrame + BinaryData("123".getBytes(), 4) :: Nil).toDF binaryData.registerTempTable("binaryData") case class TestData3(a: Int, b: Option[Int]) val testData3 = TestSQLContext.sparkContext.parallelize( TestData3(1, None) :: - TestData3(2, Some(2)) :: Nil).toDataFrame + TestData3(2, Some(2)) :: Nil).toDF testData3.registerTempTable("testData3") val emptyTableData = logical.LocalRelation($"a".int, $"b".int) @@ -97,7 +97,7 @@ object TestData { UpperCaseData(3, "C") :: UpperCaseData(4, "D") :: UpperCaseData(5, "E") :: - UpperCaseData(6, "F") :: Nil).toDataFrame + UpperCaseData(6, "F") :: Nil).toDF upperCaseData.registerTempTable("upperCaseData") case class LowerCaseData(n: Int, l: String) @@ -106,7 +106,7 @@ object TestData { LowerCaseData(1, "a") :: LowerCaseData(2, "b") :: LowerCaseData(3, "c") :: - LowerCaseData(4, "d") :: Nil).toDataFrame + LowerCaseData(4, "d") :: Nil).toDF lowerCaseData.registerTempTable("lowerCaseData") case class ArrayData(data: Seq[Int], nestedData: Seq[Seq[Int]]) @@ -114,7 +114,7 @@ object TestData { TestSQLContext.sparkContext.parallelize( ArrayData(Seq(1,2,3), Seq(Seq(1,2,3))) :: ArrayData(Seq(2,3,4), Seq(Seq(2,3,4))) :: Nil) - arrayData.registerTempTable("arrayData") + arrayData.toDF.registerTempTable("arrayData") case class MapData(data: scala.collection.Map[Int, String]) val mapData = @@ -124,18 +124,18 @@ object TestData { MapData(Map(1 -> "a3", 2 -> "b3", 3 -> "c3")) :: MapData(Map(1 -> "a4", 2 -> "b4")) :: MapData(Map(1 -> "a5")) :: Nil) - mapData.registerTempTable("mapData") + mapData.toDF.registerTempTable("mapData") case class StringData(s: String) val repeatedData = TestSQLContext.sparkContext.parallelize(List.fill(2)(StringData("test"))) - repeatedData.registerTempTable("repeatedData") + repeatedData.toDF.registerTempTable("repeatedData") val nullableRepeatedData = TestSQLContext.sparkContext.parallelize( List.fill(2)(StringData(null)) ++ List.fill(2)(StringData("test"))) - nullableRepeatedData.registerTempTable("nullableRepeatedData") + nullableRepeatedData.toDF.registerTempTable("nullableRepeatedData") case class NullInts(a: Integer) val nullInts = @@ -144,7 +144,7 @@ object TestData { NullInts(2) :: NullInts(3) :: NullInts(null) :: Nil - ) + ).toDF nullInts.registerTempTable("nullInts") val allNulls = @@ -152,7 +152,7 @@ object TestData { NullInts(null) :: NullInts(null) :: NullInts(null) :: - NullInts(null) :: Nil) + NullInts(null) :: Nil).toDF allNulls.registerTempTable("allNulls") case class NullStrings(n: Int, s: String) @@ -160,11 +160,11 @@ object TestData { TestSQLContext.sparkContext.parallelize( NullStrings(1, "abc") :: NullStrings(2, "ABC") :: - NullStrings(3, null) :: Nil).toDataFrame + NullStrings(3, null) :: Nil).toDF nullStrings.registerTempTable("nullStrings") case class TableName(tableName: String) - TestSQLContext.sparkContext.parallelize(TableName("test") :: Nil).registerTempTable("tableName") + TestSQLContext.sparkContext.parallelize(TableName("test") :: Nil).toDF.registerTempTable("tableName") val unparsedStrings = TestSQLContext.sparkContext.parallelize( @@ -177,22 +177,22 @@ object TestData { val timestamps = TestSQLContext.sparkContext.parallelize((1 to 3).map { i => TimestampField(new Timestamp(i)) }) - timestamps.registerTempTable("timestamps") + timestamps.toDF.registerTempTable("timestamps") case class 
IntField(i: Int) // An RDD with 4 elements and 8 partitions val withEmptyParts = TestSQLContext.sparkContext.parallelize((1 to 4).map(IntField), 8) - withEmptyParts.registerTempTable("withEmptyParts") + withEmptyParts.toDF.registerTempTable("withEmptyParts") case class Person(id: Int, name: String, age: Int) case class Salary(personId: Int, salary: Double) val person = TestSQLContext.sparkContext.parallelize( Person(0, "mike", 30) :: - Person(1, "jim", 20) :: Nil) + Person(1, "jim", 20) :: Nil).toDF person.registerTempTable("person") val salary = TestSQLContext.sparkContext.parallelize( Salary(0, 2000.0) :: - Salary(1, 1000.0) :: Nil) + Salary(1, 1000.0) :: Nil).toDF salary.registerTempTable("salary") case class ComplexData(m: Map[Int, String], s: TestData, a: Seq[Int], b: Boolean) @@ -200,6 +200,6 @@ object TestData { TestSQLContext.sparkContext.parallelize( ComplexData(Map(1 -> "1"), TestData(1, "1"), Seq(1), true) :: ComplexData(Map(2 -> "2"), TestData(2, "2"), Seq(2), false) - :: Nil).toDataFrame + :: Nil).toDF complexData.registerTempTable("complexData") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 95923f9aad931..be105c6e83594 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -17,11 +17,11 @@ package org.apache.spark.sql -import org.apache.spark.sql.Dsl.StringToColumn import org.apache.spark.sql.test._ /* Implicits */ import TestSQLContext._ +import TestSQLContext.implicits._ case class FunctionResult(f1: String, f2: String) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 3c1657cd5fc3a..5f21d990e2e5b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import scala.beans.{BeanInfo, BeanProperty} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext.{sparkContext, sql} import org.apache.spark.sql.test.TestSQLContext.implicits._ @@ -66,7 +66,7 @@ class UserDefinedTypeSuite extends QueryTest { val points = Seq( MyLabeledPoint(1.0, new MyDenseVector(Array(0.1, 1.0))), MyLabeledPoint(0.0, new MyDenseVector(Array(0.2, 2.0)))) - val pointsRDD: RDD[MyLabeledPoint] = sparkContext.parallelize(points) + val pointsRDD = sparkContext.parallelize(points).toDF() test("register user type: MyDenseVector for MyLabeledPoint") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala index 86b1b5fda1c0f..38b0f666ab90b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala @@ -17,10 +17,11 @@ package org.apache.spark.sql.columnar -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext._ +import org.apache.spark.sql.test.TestSQLContext.implicits._ import 
org.apache.spark.sql.{QueryTest, TestData} import org.apache.spark.storage.StorageLevel.MEMORY_ONLY @@ -28,8 +29,6 @@ class InMemoryColumnarQuerySuite extends QueryTest { // Make sure the tables are loaded. TestData - import org.apache.spark.sql.test.TestSQLContext.implicits._ - test("simple columnar query") { val plan = executePlan(testData.logicalPlan).executedPlan val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None) @@ -39,7 +38,8 @@ class InMemoryColumnarQuerySuite extends QueryTest { test("default size avoids broadcast") { // TODO: Improve this test when we have better statistics - sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString)).registerTempTable("sizeTst") + sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString)) + .toDF().registerTempTable("sizeTst") cacheTable("sizeTst") assert( table("sizeTst").queryExecution.logical.statistics.sizeInBytes > diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala index 55a9f735b3506..e57bb06e7263b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala @@ -21,13 +21,12 @@ import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite} import org.apache.spark.sql._ import org.apache.spark.sql.test.TestSQLContext._ +import org.apache.spark.sql.test.TestSQLContext.implicits._ class PartitionBatchPruningSuite extends FunSuite with BeforeAndAfterAll with BeforeAndAfter { val originalColumnBatchSize = conf.columnBatchSize val originalInMemoryPartitionPruning = conf.inMemoryPartitionPruning - import org.apache.spark.sql.test.TestSQLContext.implicits._ - override protected def beforeAll(): Unit = { // Make a table with 5 partitions, 2 batches per partition, 10 elements per batch setConf(SQLConf.COLUMN_BATCH_SIZE, "10") @@ -35,7 +34,7 @@ class PartitionBatchPruningSuite extends FunSuite with BeforeAndAfterAll with Be val pruningData = sparkContext.makeRDD((1 to 100).map { key => val string = if (((key - 1) / 10) % 2 == 0) null else key.toString TestData(key, string) - }, 5) + }, 5).toDF() pruningData.registerTempTable("pruningData") // Enable in-memory partition pruning diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index c3210733f1d42..523be56df65ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -20,12 +20,13 @@ package org.apache.spark.sql.execution import org.scalatest.FunSuite import org.apache.spark.sql.{SQLConf, execution} -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, ShuffledHashJoin} import org.apache.spark.sql.test.TestSQLContext._ +import org.apache.spark.sql.test.TestSQLContext.implicits._ import org.apache.spark.sql.test.TestSQLContext.planner._ import org.apache.spark.sql.types._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index b5f13f8bd5e80..c94e44bd7c397 
100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -21,11 +21,12 @@ import java.sql.{Date, Timestamp} import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.json.JsonRDD.{compatibleType, enforceCorrectType} import org.apache.spark.sql.sources.LogicalRelation import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ +import org.apache.spark.sql.test.TestSQLContext.implicits._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{QueryTest, Row, SQLConf} @@ -822,7 +823,7 @@ class JsonSuite extends QueryTest { val df1 = createDataFrame(rowRDD1, schema1) df1.registerTempTable("applySchema1") - val df2 = df1.toDataFrame + val df2 = df1.toDF val result = df2.toJSON.collect() assert(result(0) === "{\"f1\":1,\"f2\":\"A1\",\"f3\":true,\"f4\":[\"1\",\" A1\",\" true\",\" null\"]}") assert(result(3) === "{\"f1\":4,\"f2\":\"D4\",\"f3\":true,\"f4\":[\"4\",\" D4\",\" true\",\" 2147483644\"],\"f5\":2147483644}") @@ -843,7 +844,7 @@ class JsonSuite extends QueryTest { val df3 = createDataFrame(rowRDD2, schema2) df3.registerTempTable("applySchema2") - val df4 = df3.toDataFrame + val df4 = df3.toDF val result2 = df4.toJSON.collect() assert(result2(1) === "{\"f1\":{\"f11\":2,\"f12\":false},\"f2\":{\"B2\":null}}") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index c8ebbbc7d2eac..c306330818c0a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -33,11 +33,12 @@ import parquet.schema.{MessageType, MessageTypeParser} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf} -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ +import org.apache.spark.sql.test.TestSQLContext.implicits._ import org.apache.spark.sql.types.DecimalType // Write support class for nested groups: ParquetWriter initializes GroupWriteSupport @@ -64,6 +65,7 @@ private[parquet] class TestGroupWriteSupport(schema: MessageType) extends WriteS * A test suite that tests basic Parquet I/O. 
*/ class ParquetIOSuite extends QueryTest with ParquetTest { + val sqlContext = TestSQLContext /** @@ -99,12 +101,12 @@ class ParquetIOSuite extends QueryTest with ParquetTest { } test(s"$prefix: fixed-length decimals") { - import org.apache.spark.sql.test.TestSQLContext.implicits._ def makeDecimalRDD(decimal: DecimalType): DataFrame = sparkContext .parallelize(0 to 1000) .map(i => Tuple1(i / 100.0)) + .toDF // Parquet doesn't allow column names with spaces, have to add an alias here .select($"_1" cast decimal as "dec") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 89b18c3439cf6..9fcb04ca23590 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -37,7 +37,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { import org.apache.spark.sql.hive.test.TestHive.implicits._ val testData = TestHive.sparkContext.parallelize( - (1 to 100).map(i => TestData(i, i.toString))) + (1 to 100).map(i => TestData(i, i.toString))).toDF before { // Since every we are doing tests for DDL statements, @@ -56,7 +56,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { // Make sure the table has also been updated. checkAnswer( sql("SELECT * FROM createAndInsertTest"), - testData.collect().toSeq.map(Row.fromTuple) + testData.collect().toSeq ) // Add more data. @@ -65,7 +65,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { // Make sure the table has been updated. checkAnswer( sql("SELECT * FROM createAndInsertTest"), - testData.toDataFrame.collect().toSeq ++ testData.toDataFrame.collect().toSeq + testData.toDF.collect().toSeq ++ testData.toDF.collect().toSeq ) // Now overwrite. @@ -74,7 +74,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { // Make sure the registered table has also been updated. checkAnswer( sql("SELECT * FROM createAndInsertTest"), - testData.collect().toSeq.map(Row.fromTuple) + testData.collect().toSeq ) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala index 068aa03330c33..321b784a3f842 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala @@ -29,7 +29,7 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll { import org.apache.spark.sql.hive.test.TestHive.implicits._ val df = - sparkContext.parallelize((1 to 10).map(i => (i,s"str$i"))).toDataFrame("key", "value") + sparkContext.parallelize((1 to 10).map(i => (i,s"str$i"))).toDF("key", "value") override def beforeAll(): Unit = { // The catalog in HiveContext is a case insensitive one. 
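The changes across these SQL test suites all follow one migration pattern: an RDD of case classes or tuples is no longer usable as a table directly, so it is converted with `toDF` (previously `toDataFrame`) before `registerTempTable`, and the DSL import moves from `Dsl._` to `functions._`. A minimal sketch of that pattern, with illustrative names (not taken from the suites) and a standalone `SQLContext` instead of the test contexts used above:

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Illustrative case class; not one of the test fixtures above.
case class Record(key: Int, value: String)

object ToDFMigrationSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("toDF-sketch").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)
    // Brings in the RDD-to-DataFrame conversions (rdd.toDF()), analogous to
    // TestSQLContext.implicits._ / TestHive.implicits._ in the suites above.
    import sqlContext.implicits._

    // Old: parallelize(...).registerTempTable("records") relied on an implicit conversion;
    // new: an explicit toDF() is required before registering the temp table.
    val records = sc.parallelize((1 to 10).map(i => Record(i, s"str$i"))).toDF()
    records.registerTempTable("records")

    sqlContext.sql("SELECT key, value FROM records WHERE key > 5").collect().foreach(println)
    sc.stop()
  }
}
{% endhighlight %}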
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 2916724f66e24..addf887ab9162 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -28,17 +28,14 @@ import org.apache.spark.sql.catalyst.util import org.apache.spark.sql._ import org.apache.spark.util.Utils import org.apache.spark.sql.types._ - -/* Implicits */ import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.hive.test.TestHive.implicits._ /** * Tests for persisting tables created though the data sources API into the metastore. */ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { - import org.apache.spark.sql.hive.test.TestHive.implicits._ - override def afterEach(): Unit = { reset() if (tempPath.exists()) Utils.deleteRecursively(tempPath) @@ -154,7 +151,8 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { test("check change without refresh") { val tempDir = File.createTempFile("sparksql", "json") tempDir.delete() - sparkContext.parallelize(("a", "b") :: Nil).toJSON.saveAsTextFile(tempDir.getCanonicalPath) + sparkContext.parallelize(("a", "b") :: Nil).toDF + .toJSON.saveAsTextFile(tempDir.getCanonicalPath) sql( s""" @@ -170,7 +168,8 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { Row("a", "b")) FileUtils.deleteDirectory(tempDir) - sparkContext.parallelize(("a1", "b1", "c1") :: Nil).toJSON.saveAsTextFile(tempDir.getCanonicalPath) + sparkContext.parallelize(("a1", "b1", "c1") :: Nil).toDF + .toJSON.saveAsTextFile(tempDir.getCanonicalPath) // Schema is cached so the new column does not show. The updated values in existing columns // will show. 
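The same conversion shows up around `toJSON` in this suite: `toJSON` is a DataFrame method, so the RDD of tuples now needs an explicit `toDF` first. A short spark-shell-style sketch under that assumption (`sc` and `sqlContext` are assumed to be in scope, and the output path is illustrative):

{% highlight scala %}
import sqlContext.implicits._

sc.parallelize(("a", "b") :: Nil)
  .toDF()                              // columns default to _1, _2
  .toJSON                              // RDD[String], one JSON object per record
  .saveAsTextFile("/tmp/jsonTable")    // illustrative path
{% endhighlight %}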
@@ -190,7 +189,8 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { test("drop, change, recreate") { val tempDir = File.createTempFile("sparksql", "json") tempDir.delete() - sparkContext.parallelize(("a", "b") :: Nil).toJSON.saveAsTextFile(tempDir.getCanonicalPath) + sparkContext.parallelize(("a", "b") :: Nil).toDF + .toJSON.saveAsTextFile(tempDir.getCanonicalPath) sql( s""" @@ -206,7 +206,8 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { Row("a", "b")) FileUtils.deleteDirectory(tempDir) - sparkContext.parallelize(("a", "b", "c") :: Nil).toJSON.saveAsTextFile(tempDir.getCanonicalPath) + sparkContext.parallelize(("a", "b", "c") :: Nil).toDF + .toJSON.saveAsTextFile(tempDir.getCanonicalPath) sql("DROP TABLE jsonTable") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 405b200d05412..d01dbf80ef66d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -29,7 +29,7 @@ import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.{SparkFiles, SparkException} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.plans.logical.Project -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ @@ -567,7 +567,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { TestHive.sparkContext.parallelize( TestData(1, "str1") :: TestData(2, "str2") :: Nil) - testData.registerTempTable("REGisteredTABle") + testData.toDF.registerTempTable("REGisteredTABle") assertResult(Array(Row(2, "str2"))) { sql("SELECT tablealias.A, TABLEALIAS.b FROM reGisteredTABle TableAlias " + @@ -592,7 +592,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { test("SPARK-2180: HAVING support in GROUP BY clauses (positive)") { val fixture = List(("foo", 2), ("bar", 1), ("foo", 4), ("bar", 3)) .zipWithIndex.map {case Pair(Pair(value, attr), key) => HavingRow(key, value, attr)} - TestHive.sparkContext.parallelize(fixture).registerTempTable("having_test") + TestHive.sparkContext.parallelize(fixture).toDF.registerTempTable("having_test") val results = sql("SELECT value, max(attr) AS attr FROM having_test GROUP BY value HAVING attr > 3") .collect() @@ -740,7 +740,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { TestHive.sparkContext.parallelize( TestData(1, "str1") :: TestData(1, "str2") :: Nil) - testData.registerTempTable("test_describe_commands2") + testData.toDF.registerTempTable("test_describe_commands2") assertResult( Array( @@ -900,8 +900,8 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { } test("SPARK-3414 regression: should store analyzed logical plan when registering a temp table") { - sparkContext.makeRDD(Seq.empty[LogEntry]).registerTempTable("rawLogs") - sparkContext.makeRDD(Seq.empty[LogFile]).registerTempTable("logFiles") + sparkContext.makeRDD(Seq.empty[LogEntry]).toDF.registerTempTable("rawLogs") + sparkContext.makeRDD(Seq.empty[LogFile]).toDF.registerTempTable("logFiles") sql( """ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala index 029c36aa89b26..6fc4cc14265ec 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala @@ -77,7 +77,7 @@ class HiveResolutionSuite extends HiveComparisonTest { test("case insensitivity with scala reflection") { // Test resolution with Scala Reflection sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) - .registerTempTable("caseSensitivityTest") + .toDF.registerTempTable("caseSensitivityTest") val query = sql("SELECT a, b, A, B, n.a, n.b, n.A, n.B FROM caseSensitivityTest") assert(query.schema.fields.map(_.name) === Seq("a", "b", "A", "B", "a", "b", "A", "B"), @@ -88,14 +88,14 @@ class HiveResolutionSuite extends HiveComparisonTest { ignore("case insensitivity with scala reflection joins") { // Test resolution with Scala Reflection sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) - .registerTempTable("caseSensitivityTest") + .toDF.registerTempTable("caseSensitivityTest") sql("SELECT * FROM casesensitivitytest a JOIN casesensitivitytest b ON a.a = b.a").collect() } test("nested repeated resolution") { sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) - .registerTempTable("nestedRepeatedTest") + .toDF.registerTempTable("nestedRepeatedTest") assert(sql("SELECT nestedArray[0].a FROM nestedRepeatedTest").collect().head(0) === 1) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index 8fb5e050a237a..ab53c6309e089 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -18,9 +18,10 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.Row -import org.apache.spark.sql.Dsl._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.hive.test.TestHive.implicits._ import org.apache.spark.util.Utils diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala index 1e99003d3e9b5..245161d2ebbca 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala @@ -111,7 +111,7 @@ class HiveUdfSuite extends QueryTest { test("UDFIntegerToString") { val testData = TestHive.sparkContext.parallelize( - IntegerCaseClass(1) :: IntegerCaseClass(2) :: Nil) + IntegerCaseClass(1) :: IntegerCaseClass(2) :: Nil).toDF testData.registerTempTable("integerTable") sql(s"CREATE TEMPORARY FUNCTION testUDFIntegerToString AS '${classOf[UDFIntegerToString].getName}'") @@ -127,7 +127,7 @@ class HiveUdfSuite extends QueryTest { val testData = TestHive.sparkContext.parallelize( ListListIntCaseClass(Nil) :: ListListIntCaseClass(Seq((1, 2, 3))) :: - ListListIntCaseClass(Seq((4, 5, 6), (7, 8, 9))) :: Nil) + ListListIntCaseClass(Seq((4, 5, 6), (7, 8, 9))) :: Nil).toDF testData.registerTempTable("listListIntTable") sql(s"CREATE TEMPORARY FUNCTION testUDFListListInt AS '${classOf[UDFListListInt].getName}'") @@ -142,7 +142,7 @@ 
class HiveUdfSuite extends QueryTest { test("UDFListString") { val testData = TestHive.sparkContext.parallelize( ListStringCaseClass(Seq("a", "b", "c")) :: - ListStringCaseClass(Seq("d", "e")) :: Nil) + ListStringCaseClass(Seq("d", "e")) :: Nil).toDF testData.registerTempTable("listStringTable") sql(s"CREATE TEMPORARY FUNCTION testUDFListString AS '${classOf[UDFListString].getName}'") @@ -156,7 +156,7 @@ class HiveUdfSuite extends QueryTest { test("UDFStringString") { val testData = TestHive.sparkContext.parallelize( - StringCaseClass("world") :: StringCaseClass("goodbye") :: Nil) + StringCaseClass("world") :: StringCaseClass("goodbye") :: Nil).toDF testData.registerTempTable("stringTable") sql(s"CREATE TEMPORARY FUNCTION testStringStringUdf AS '${classOf[UDFStringString].getName}'") @@ -173,7 +173,7 @@ class HiveUdfSuite extends QueryTest { ListListIntCaseClass(Nil) :: ListListIntCaseClass(Seq((1, 2, 3))) :: ListListIntCaseClass(Seq((4, 5, 6), (7, 8, 9))) :: - Nil) + Nil).toDF testData.registerTempTable("TwoListTable") sql(s"CREATE TEMPORARY FUNCTION testUDFTwoListList AS '${classOf[UDFTwoListList].getName}'") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 9a6e8650a0ec4..978825938395f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.hive.HiveShim import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.hive.test.TestHive.implicits._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{QueryTest, Row, SQLConf} @@ -34,9 +35,6 @@ case class Nested3(f3: Int) */ class SQLQuerySuite extends QueryTest { - import org.apache.spark.sql.hive.test.TestHive.implicits._ - val sqlCtx = TestHive - test("SPARK-4512 Fix attribute reference resolution error when using SORT BY") { checkAnswer( sql("SELECT * FROM (SELECT key + key AS a FROM src SORT BY value) t ORDER BY t.a"), @@ -176,7 +174,8 @@ class SQLQuerySuite extends QueryTest { } test("double nested data") { - sparkContext.parallelize(Nested1(Nested2(Nested3(1))) :: Nil).registerTempTable("nested") + sparkContext.parallelize(Nested1(Nested2(Nested3(1))) :: Nil) + .toDF().registerTempTable("nested") checkAnswer( sql("SELECT f1.f2.f3 FROM nested"), Row(1)) @@ -199,7 +198,7 @@ class SQLQuerySuite extends QueryTest { } test("SPARK-4825 save join to table") { - val testData = sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString)) + val testData = sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString)).toDF() sql("CREATE TABLE test1 (key INT, value STRING)") testData.insertInto("test1") sql("CREATE TABLE test2 (key INT, value STRING)") @@ -279,7 +278,7 @@ class SQLQuerySuite extends QueryTest { val rowRdd = sparkContext.parallelize(row :: Nil) - sqlCtx.createDataFrame(rowRdd, schema).registerTempTable("testTable") + TestHive.createDataFrame(rowRdd, schema).registerTempTable("testTable") sql( """CREATE TABLE nullValuesInInnerComplexTypes diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index a7479a5b95864..e246cbb6d77f0 100644 --- 
a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -27,6 +27,8 @@ import org.apache.spark.sql.{SQLConf, QueryTest} import org.apache.spark.sql.execution.PhysicalRDD import org.apache.spark.sql.hive.execution.HiveTableScan import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.hive.test.TestHive.implicits._ + // The data where the partitioning key exists only in the directory structure. case class ParquetData(intField: Int, stringField: String) @@ -152,7 +154,6 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll var normalTableDir: File = null var partitionedTableDirWithKey: File = null - import org.apache.spark.sql.hive.test.TestHive.implicits._ override def beforeAll(): Unit = { partitionedTableDir = File.createTempFile("parquettests", "sparksql") @@ -167,12 +168,14 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll val partDir = new File(partitionedTableDir, s"p=$p") sparkContext.makeRDD(1 to 10) .map(i => ParquetData(i, s"part-$p")) + .toDF() .saveAsParquetFile(partDir.getCanonicalPath) } sparkContext .makeRDD(1 to 10) .map(i => ParquetData(i, s"part-1")) + .toDF() .saveAsParquetFile(new File(normalTableDir, "normal").getCanonicalPath) partitionedTableDirWithKey = File.createTempFile("parquettests", "sparksql") @@ -183,6 +186,7 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll val partDir = new File(partitionedTableDirWithKey, s"p=$p") sparkContext.makeRDD(1 to 10) .map(i => ParquetDataWithKey(p, i, s"part-$p")) + .toDF() .saveAsParquetFile(partDir.getCanonicalPath) } } From f80e2629bb74bc62960c61ff313f7e7802d61319 Mon Sep 17 00:00:00 2001 From: gasparms Date: Sat, 14 Feb 2015 20:10:29 +0000 Subject: [PATCH 169/357] [SPARK-5800] Streaming Docs. Change linked files according the selected language Currently, Spark Streaming Programming Guide after updateStateByKey explanation links to file stateful_network_wordcount.py and note "For the complete Scala code ..." for any language tab selected. This is an incoherence. I've changed the guide and link its pertinent example file. JavaStatefulNetworkWordCount.java example was not created so I added to the commit. Author: gasparms Closes #4589 from gasparms/feature/streaming-guide and squashes the following commits: 7f37f89 [gasparms] More style changes ec202b0 [gasparms] Follow spark style guide f527328 [gasparms] Improve example to look like scala example 4d8785c [gasparms] Remove throw exception e92e6b8 [gasparms] Fix incoherence 92db405 [gasparms] Fix Streaming Programming Guide. 
Change files according the selected language --- docs/streaming-programming-guide.md | 21 +++- .../JavaStatefulNetworkWordCount.java | 115 ++++++++++++++++++ 2 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 96fb12ce5e0b9..997de9511ca3e 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -878,6 +878,12 @@ This is applied on a DStream containing words (say, the `pairs` DStream containi val runningCounts = pairs.updateStateByKey[Int](updateFunction _) {% endhighlight %} +The update function will be called for each word, with `newValues` having a sequence of 1's (from +the `(word, 1)` pairs) and the `runningCount` having the previous count. For the complete +Scala code, take a look at the example +[StatefulNetworkWordCount.scala]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/scala/org/apache +/spark/examples/streaming/StatefulNetworkWordCount.scala). +
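For readers who want the shape of the update function without opening the linked example, here is a sketch consistent with the `updateStateByKey[Int](updateFunction _)` call shown above; it is not a verbatim quote of StatefulNetworkWordCount.scala:

{% highlight scala %}
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.dstream.DStream

// `pairs` is assumed to be a DStream[(String, Int)] of (word, 1) records, and the
// StreamingContext is assumed to have a checkpoint directory configured.
def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
  // Add this batch's counts to the running total carried as per-key state.
  Some(runningCount.getOrElse(0) + newValues.sum)
}

def runningWordCounts(pairs: DStream[(String, Int)]): DStream[(String, Int)] =
  pairs.updateStateByKey[Int](updateFunction _)
{% endhighlight %}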
@@ -899,6 +905,13 @@ This is applied on a DStream containing words (say, the `pairs` DStream containi
 JavaPairDStream<String, Integer> runningCounts = pairs.updateStateByKey(updateFunction);
 {% endhighlight %}
+The update function will be called for each word, with `newValues` having a sequence of 1's (from
+the `(word, 1)` pairs) and the `runningCount` having the previous count. For the complete
+Java code, take a look at the example
+[JavaStatefulNetworkWordCount.java]({{site
+.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/org/apache/spark/examples/streaming
+/JavaStatefulNetworkWordCount.java).
+
@@ -916,14 +929,14 @@ This is applied on a DStream containing words (say, the `pairs` DStream containi
 runningCounts = pairs.updateStateByKey(updateFunction)
 {% endhighlight %}
-</div>
-</div>
-
 The update function will be called for each word, with `newValues` having a sequence of 1's (from
 the `(word, 1)` pairs) and the `runningCount` having the previous count. For the complete
-Scala code, take a look at the example
+Python code, take a look at the example
 [stateful_network_wordcount.py]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/python/streaming/stateful_network_wordcount.py).
+</div>
+</div>
    + Note that using `updateStateByKey` requires the checkpoint directory to be configured, which is discussed in detail in the [checkpointing](#checkpointing) section. diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java new file mode 100644 index 0000000000000..09491fe300822 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.streaming; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + +import scala.Tuple2; + +import com.google.common.base.Optional; +import com.google.common.collect.Lists; + +import org.apache.spark.HashPartitioner; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.StorageLevels; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.streaming.Durations; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.apache.spark.streaming.api.java.JavaPairDStream; +import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; +import org.apache.spark.streaming.api.java.JavaStreamingContext; + + +/** + * Counts words cumulatively in UTF8 encoded, '\n' delimited text received from the network every + * second starting with initial value of word count. + * Usage: JavaStatefulNetworkWordCount + * and describe the TCP server that Spark Streaming would connect to receive + * data. + *

    + * To run this on your local machine, you need to first run a Netcat server + * `$ nc -lk 9999` + * and then run the example + * `$ bin/run-example + * org.apache.spark.examples.streaming.JavaStatefulNetworkWordCount localhost 9999` + */ +public class JavaStatefulNetworkWordCount { + private static final Pattern SPACE = Pattern.compile(" "); + + public static void main(String[] args) { + if (args.length < 2) { + System.err.println("Usage: JavaStatefulNetworkWordCount "); + System.exit(1); + } + + StreamingExamples.setStreamingLogLevels(); + + // Update the cumulative count function + final Function2, Optional, Optional> updateFunction = new + Function2, Optional, Optional>() { + @Override + public Optional call(List values, Optional state) { + Integer newSum = state.or(0); + for (Integer value : values) { + newSum += value; + } + return Optional.of(newSum); + } + }; + + // Create the context with a 1 second batch size + SparkConf sparkConf = new SparkConf().setAppName("JavaStatefulNetworkWordCount"); + JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); + ssc.checkpoint("."); + + // Initial RDD input to updateStateByKey + List> tuples = Arrays.asList(new Tuple2("hello", 1), + new Tuple2("world", 1)); + JavaPairRDD initialRDD = ssc.sc().parallelizePairs(tuples); + + JavaReceiverInputDStream lines = ssc.socketTextStream( + args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER_2); + + JavaDStream words = lines.flatMap(new FlatMapFunction() { + @Override + public Iterable call(String x) { + return Lists.newArrayList(SPACE.split(x)); + } + }); + + JavaPairDStream wordsDstream = words.mapToPair(new PairFunction() { + @Override + public Tuple2 call(String s) { + return new Tuple2(s, 1); + } + }); + + // This will give a Dstream made of state (which is the cumulative count of the words) + JavaPairDStream stateDstream = wordsDstream.updateStateByKey(updateFunction, + new HashPartitioner(ssc.sc().defaultParallelism()), initialRDD); + + stateDstream.print(); + ssc.start(); + ssc.awaitTermination(); + } +} From 15a2ab5f89d56e67c84e7163d28d93e72583393c Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 14 Feb 2015 20:12:29 +0000 Subject: [PATCH 170/357] Revise formatting of previous commit f80e2629bb74bc62960c61ff313f7e7802d61319 --- .../JavaStatefulNetworkWordCount.java | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java b/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java index 09491fe300822..d46c7107c7a21 100644 --- a/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java +++ b/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java @@ -39,7 +39,6 @@ import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; import org.apache.spark.streaming.api.java.JavaStreamingContext; - /** * Counts words cumulatively in UTF8 encoded, '\n' delimited text received from the network every * second starting with initial value of word count. 
@@ -65,17 +64,17 @@ public static void main(String[] args) { StreamingExamples.setStreamingLogLevels(); // Update the cumulative count function - final Function2, Optional, Optional> updateFunction = new - Function2, Optional, Optional>() { - @Override - public Optional call(List values, Optional state) { - Integer newSum = state.or(0); - for (Integer value : values) { - newSum += value; - } - return Optional.of(newSum); - } - }; + final Function2, Optional, Optional> updateFunction = + new Function2, Optional, Optional>() { + @Override + public Optional call(List values, Optional state) { + Integer newSum = state.or(0); + for (Integer value : values) { + newSum += value; + } + return Optional.of(newSum); + } + }; // Create the context with a 1 second batch size SparkConf sparkConf = new SparkConf().setAppName("JavaStatefulNetworkWordCount"); @@ -97,12 +96,13 @@ public Iterable call(String x) { } }); - JavaPairDStream wordsDstream = words.mapToPair(new PairFunction() { - @Override - public Tuple2 call(String s) { - return new Tuple2(s, 1); - } - }); + JavaPairDStream wordsDstream = words.mapToPair( + new PairFunction() { + @Override + public Tuple2 call(String s) { + return new Tuple2(s, 1); + } + }); // This will give a Dstream made of state (which is the cumulative count of the words) JavaPairDStream stateDstream = wordsDstream.updateStateByKey(updateFunction, From ed5f4bb7cb2c934b818d1e8b8b4e6a0056119c80 Mon Sep 17 00:00:00 2001 From: gli Date: Sat, 14 Feb 2015 20:43:27 +0000 Subject: [PATCH 171/357] SPARK-5822 [BUILD] cannot import src/main/scala & src/test/scala into eclipse as source folder When import the whole project into eclipse as maven project, found that the src/main/scala & src/test/scala can not be set as source folder as default behavior, so add a "add-source" goal in scala-maven-plugin to let this work. Author: gli Closes #4531 from ligangty/addsource and squashes the following commits: 4e4db4c [gli] [IDE] cannot import src/main/scala & src/test/scala into eclipse as source folder --- pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pom.xml b/pom.xml index 53372d5cfc624..6810d71be4230 100644 --- a/pom.xml +++ b/pom.xml @@ -1083,6 +1083,12 @@ scala-maven-plugin 3.2.0 + + eclipse-add-source + + add-source + + scala-compile-first process-resources From c771e475c449fe07cf45f37bdca2ba6ce9600bfc Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Sun, 15 Feb 2015 14:42:20 +0000 Subject: [PATCH 172/357] [SPARK-5827][SQL] Add missing import in the example of SqlContext If one tries an example by using copy&paste, throw an exception. 
Author: Takeshi Yamamuro Closes #4615 from maropu/AddMissingImportInSqlContext and squashes the following commits: ab21b66 [Takeshi Yamamuro] Add missing import in the example of SqlContext --- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index a1736d0277f1c..6d1914808aeed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -286,6 +286,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * Example: * {{{ * import org.apache.spark.sql._ + * import org.apache.spark.sql.types._ * val sqlContext = new org.apache.spark.sql.SQLContext(sc) * * val schema = @@ -377,6 +378,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * Example: * {{{ * import org.apache.spark.sql._ + * import org.apache.spark.sql.types._ * val sqlContext = new org.apache.spark.sql.SQLContext(sc) * * val schema = From 61eb12674b90143388a01c22bf51cb7d02ab0447 Mon Sep 17 00:00:00 2001 From: martinzapletal Date: Sun, 15 Feb 2015 09:10:03 -0800 Subject: [PATCH 173/357] [MLLIB][SPARK-5502] User guide for isotonic regression User guide for isotonic regression added to docs/mllib-regression.md including code examples for Scala and Java. Author: martinzapletal Closes #4536 from zapletal-martin/SPARK-5502 and squashes the following commits: 67fe773 [martinzapletal] SPARK-5502 reworded model prediction rules to use more general language rather than the code/implementation specific terms 80bd4c3 [martinzapletal] SPARK-5502 created docs page for isotonic regression, added links to the page, updated data and examples 7d8136e [martinzapletal] SPARK-5502 Added documentation for Isotonic regression including examples for Scala and Java 504b5c3 [martinzapletal] SPARK-5502 Added documentation for Isotonic regression including examples for Scala and Java --- .../mllib/sample_isotonic_regression_data.txt | 100 +++++++++++ docs/mllib-classification-regression.md | 3 +- docs/mllib-guide.md | 1 + docs/mllib-isotonic-regression.md | 155 ++++++++++++++++++ 4 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 data/mllib/sample_isotonic_regression_data.txt create mode 100644 docs/mllib-isotonic-regression.md diff --git a/data/mllib/sample_isotonic_regression_data.txt b/data/mllib/sample_isotonic_regression_data.txt new file mode 100644 index 0000000000000..d257b509d4d37 --- /dev/null +++ b/data/mllib/sample_isotonic_regression_data.txt @@ -0,0 +1,100 @@ +0.24579296,0.01 +0.28505864,0.02 +0.31208567,0.03 +0.35900051,0.04 +0.35747068,0.05 +0.16675166,0.06 +0.17491076,0.07 +0.04181540,0.08 +0.04793473,0.09 +0.03926568,0.10 +0.12952575,0.11 +0.00000000,0.12 +0.01376849,0.13 +0.13105558,0.14 +0.08873024,0.15 +0.12595614,0.16 +0.15247323,0.17 +0.25956145,0.18 +0.20040796,0.19 +0.19581846,0.20 +0.15757267,0.21 +0.13717491,0.22 +0.19020908,0.23 +0.19581846,0.24 +0.20091790,0.25 +0.16879143,0.26 +0.18510964,0.27 +0.20040796,0.28 +0.29576747,0.29 +0.43396226,0.30 +0.53391127,0.31 +0.52116267,0.32 +0.48546660,0.33 +0.49209587,0.34 +0.54156043,0.35 +0.59765426,0.36 +0.56144824,0.37 +0.58592555,0.38 +0.52983172,0.39 +0.50178480,0.40 +0.52626211,0.41 +0.58286588,0.42 +0.64660887,0.43 +0.68077511,0.44 +0.74298827,0.45 +0.64864865,0.46 +0.67261601,0.47 +0.65782764,0.48 +0.69811321,0.49 +0.63029067,0.50 +0.61601224,0.51 +0.63233044,0.52 
+0.65323814,0.53 +0.65323814,0.54 +0.67363590,0.55 +0.67006629,0.56 +0.51555329,0.57 +0.50892402,0.58 +0.33299337,0.59 +0.36206017,0.60 +0.43090260,0.61 +0.45996940,0.62 +0.56348802,0.63 +0.54920959,0.64 +0.48393677,0.65 +0.48495665,0.66 +0.46965834,0.67 +0.45181030,0.68 +0.45843957,0.69 +0.47118817,0.70 +0.51555329,0.71 +0.58031617,0.72 +0.55481897,0.73 +0.56297807,0.74 +0.56603774,0.75 +0.57929628,0.76 +0.64762876,0.77 +0.66241713,0.78 +0.69301377,0.79 +0.65119837,0.80 +0.68332483,0.81 +0.66598674,0.82 +0.73890872,0.83 +0.73992861,0.84 +0.84242733,0.85 +0.91330954,0.86 +0.88016318,0.87 +0.90719021,0.88 +0.93115757,0.89 +0.93115757,0.90 +0.91942886,0.91 +0.92911780,0.92 +0.95665477,0.93 +0.95002550,0.94 +0.96940337,0.95 +1.00000000,0.96 +0.89801122,0.97 +0.90311066,0.98 +0.90362060,0.99 +0.83477817,1.0 \ No newline at end of file diff --git a/docs/mllib-classification-regression.md b/docs/mllib-classification-regression.md index 719cc95767b00..5b9b4dd83b774 100644 --- a/docs/mllib-classification-regression.md +++ b/docs/mllib-classification-regression.md @@ -23,7 +23,7 @@ the supported algorithms for each type of problem. Multiclass Classificationdecision trees, naive Bayes - Regressionlinear least squares, Lasso, ridge regression, decision trees + Regressionlinear least squares, Lasso, ridge regression, decision trees, isotonic regression @@ -35,3 +35,4 @@ More details for these methods can be found here: * [linear regression (least squares, Lasso, ridge)](mllib-linear-methods.html#linear-least-squares-lasso-and-ridge-regression) * [Decision trees](mllib-decision-tree.html) * [Naive Bayes](mllib-naive-bayes.html) +* [Isotonic regression](mllib-isotonic-regression.html) diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 5091dbf329b97..fbe809b3478e5 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -21,6 +21,7 @@ filtering, dimensionality reduction, as well as underlying optimization primitiv * [naive Bayes](mllib-naive-bayes.html) * [decision trees](mllib-decision-tree.html) * [ensembles of trees](mllib-ensembles.html) (Random Forests and Gradient-Boosted Trees) + * [isotonic regression](mllib-isotonic-regression.html) * [Collaborative filtering](mllib-collaborative-filtering.html) * alternating least squares (ALS) * [Clustering](mllib-clustering.html) diff --git a/docs/mllib-isotonic-regression.md b/docs/mllib-isotonic-regression.md new file mode 100644 index 0000000000000..12fb29d426741 --- /dev/null +++ b/docs/mllib-isotonic-regression.md @@ -0,0 +1,155 @@ +--- +layout: global +title: Naive Bayes - MLlib +displayTitle: MLlib - Regression +--- + +## Isotonic regression +[Isotonic regression](http://en.wikipedia.org/wiki/Isotonic_regression) +belongs to the family of regression algorithms. Formally isotonic regression is a problem where +given a finite set of real numbers `$Y = {y_1, y_2, ..., y_n}$` representing observed responses +and `$X = {x_1, x_2, ..., x_n}$` the unknown response values to be fitted +finding a function that minimises + +`\begin{equation} + f(x) = \sum_{i=1}^n w_i (y_i - x_i)^2 +\end{equation}` + +with respect to complete order subject to +`$x_1\le x_2\le ...\le x_n$` where `$w_i$` are positive weights. +The resulting function is called isotonic regression and it is unique. +It can be viewed as least squares problem under order restriction. +Essentially isotonic regression is a +[monotonic function](http://en.wikipedia.org/wiki/Monotonic_function) +best fitting the original data points. 
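As a quick illustration of the pooling step (a made-up instance, not from the guide or the sample data): with unit weights and observed responses `$y = (1, 3, 2)$`, the adjacent pair `$(3, 2)$` violates the ordering, so pool adjacent violators replaces both values by their weighted mean, giving the isotonic fit

`\begin{equation}
  x = (1,\ 2.5,\ 2.5)
\end{equation}`

whose squared error is 0.5, the minimum attainable over all non-decreasing sequences for this input.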
+ +MLlib supports a +[pool adjacent violators algorithm](http://doi.org/10.1198/TECH.2010.10111) +which uses an approach to +[parallelizing isotonic regression](http://doi.org/10.1007/978-3-642-99789-1_10). +The training input is a RDD of tuples of three double values that represent +label, feature and weight in this order. Additionally IsotonicRegression algorithm has one +optional parameter called $isotonic$ defaulting to true. +This argument specifies if the isotonic regression is +isotonic (monotonically increasing) or antitonic (monotonically decreasing). + +Training returns an IsotonicRegressionModel that can be used to predict +labels for both known and unknown features. The result of isotonic regression +is treated as piecewise linear function. The rules for prediction therefore are: + +* If the prediction input exactly matches a training feature + then associated prediction is returned. In case there are multiple predictions with the same + feature then one of them is returned. Which one is undefined + (same as java.util.Arrays.binarySearch). +* If the prediction input is lower or higher than all training features + then prediction with lowest or highest feature is returned respectively. + In case there are multiple predictions with the same feature + then the lowest or highest is returned respectively. +* If the prediction input falls between two training features then prediction is treated + as piecewise linear function and interpolated value is calculated from the + predictions of the two closest features. In case there are multiple values + with the same feature then the same rules as in previous point are used. + +### Examples + +

+<div data-lang="scala" markdown="1">
    +Data are read from a file where each line has a format label,feature +i.e. 4710.28,500.00. The data are split to training and testing set. +Model is created using the training set and a mean squared error is calculated from the predicted +labels and real labels in the test set. + +{% highlight scala %} +import org.apache.spark.mllib.regression.IsotonicRegression + +val data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt") + +// Create label, feature, weight tuples from input data with weight set to default value 1.0. +val parsedData = data.map { line => + val parts = line.split(',').map(_.toDouble) + (parts(0), parts(1), 1.0) +} + +// Split data into training (60%) and test (40%) sets. +val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L) +val training = splits(0) +val test = splits(1) + +// Create isotonic regression model from training data. +// Isotonic parameter defaults to true so it is only shown for demonstration +val model = new IsotonicRegression().setIsotonic(true).run(training) + +// Create tuples of predicted and real labels. +val predictionAndLabel = test.map { point => + val predictedLabel = model.predict(point._2) + (predictedLabel, point._1) +} + +// Calculate mean squared error between predicted and real labels. +val meanSquaredError = predictionAndLabel.map{case(p, l) => math.pow((p - l), 2)}.mean() +println("Mean Squared Error = " + meanSquaredError) +{% endhighlight %} +
+
+<div data-lang="java" markdown="1">
    +Data are read from a file where each line has a format label,feature +i.e. 4710.28,500.00. The data are split to training and testing set. +Model is created using the training set and a mean squared error is calculated from the predicted +labels and real labels in the test set. + +{% highlight java %} +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaDoubleRDD; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.mllib.regression.IsotonicRegressionModel; +import scala.Tuple2; +import scala.Tuple3; + +JavaRDD data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt"); + +// Create label, feature, weight tuples from input data with weight set to default value 1.0. +JavaRDD> parsedData = data.map( + new Function>() { + public Tuple3 call(String line) { + String[] parts = line.split(","); + return new Tuple3<>(new Double(parts[0]), new Double(parts[1]), 1.0); + } + } +); + +// Split data into training (60%) and test (40%) sets. +JavaRDD>[] splits = parsedData.randomSplit(new double[] {0.6, 0.4}, 11L); +JavaRDD> training = splits[0]; +JavaRDD> test = splits[1]; + +// Create isotonic regression model from training data. +// Isotonic parameter defaults to true so it is only shown for demonstration +IsotonicRegressionModel model = new IsotonicRegression().setIsotonic(true).run(training); + +// Create tuples of predicted and real labels. +JavaPairRDD predictionAndLabel = test.mapToPair( + new PairFunction, Double, Double>() { + @Override public Tuple2 call(Tuple3 point) { + Double predictedLabel = model.predict(point._2()); + return new Tuple2(predictedLabel, point._1()); + } + } +); + +// Calculate mean squared error between predicted and real labels. +Double meanSquaredError = new JavaDoubleRDD(predictionAndLabel.map( + new Function, Object>() { + @Override public Object call(Tuple2 pl) { + return Math.pow(pl._1() - pl._2(), 2); + } + } +).rdd()).mean(); + +System.out.println("Mean Squared Error = " + meanSquaredError); +{% endhighlight %} +
+</div>
    \ No newline at end of file From 836577b382695558f5c97d94ee725d0156ebfad2 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 15 Feb 2015 09:15:48 -0800 Subject: [PATCH 174/357] SPARK-5669 [BUILD] Spark assembly includes incompatibly licensed libgfortran, libgcc code via JBLAS Exclude libgfortran, libgcc bundled by JBLAS for Windows. This much is simple, and solves the essential license issue. But the more important question is whether MLlib works on Windows then. Author: Sean Owen Closes #4453 from srowen/SPARK-5669 and squashes the following commits: 734dd86 [Sean Owen] Exclude libgfortran, libgcc bundled by JBLAS, affecting Windows / OS X / Linux 32-bit (not Linux 64-bit) --- assembly/pom.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/assembly/pom.xml b/assembly/pom.xml index fa9f56e556d8b..fbb6e94839d42 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -114,6 +114,16 @@ META-INF/*.RSA + + + org.jblas:jblas + + + lib/Linux/i386/** + lib/Mac OS X/** + lib/Windows/** + + From cd4a15366244657c4b7936abe5054754534366f2 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sun, 15 Feb 2015 20:29:26 -0800 Subject: [PATCH 175/357] [SPARK-5769] Set params in constructors and in setParams in Python ML pipelines This PR allow Python users to set params in constructors and in setParams, where we use decorator `keyword_only` to force keyword arguments. The trade-off is discussed in the design doc of SPARK-4586. Generated doc: ![screen shot 2015-02-12 at 3 06 58 am](https://cloud.githubusercontent.com/assets/829644/6166491/9cfcd06a-b265-11e4-99ea-473d866634fc.png) CC: davies rxin Author: Xiangrui Meng Closes #4564 from mengxr/py-pipeline-kw and squashes the following commits: fedf720 [Xiangrui Meng] use toDF d565f2c [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into py-pipeline-kw cbc15d3 [Xiangrui Meng] fix style 5032097 [Xiangrui Meng] update pipeline signature 950774e [Xiangrui Meng] simplify keyword_only and update constructor/setParams signatures fdde5fc [Xiangrui Meng] fix style c9384b8 [Xiangrui Meng] fix sphinx doc 8e59180 [Xiangrui Meng] add setParams and make constructors take params, where we force keyword args --- .../ml/simple_text_classification_pipeline.py | 44 +++++------- python/docs/conf.py | 4 ++ python/pyspark/ml/classification.py | 44 +++++++++--- python/pyspark/ml/feature.py | 72 +++++++++++++++---- python/pyspark/ml/param/__init__.py | 8 +++ python/pyspark/ml/pipeline.py | 19 ++++- python/pyspark/ml/util.py | 15 ++++ 7 files changed, 153 insertions(+), 53 deletions(-) diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py index c7df3d7b74767..b4d9355b681f6 100644 --- a/examples/src/main/python/ml/simple_text_classification_pipeline.py +++ b/examples/src/main/python/ml/simple_text_classification_pipeline.py @@ -36,43 +36,33 @@ sqlCtx = SQLContext(sc) # Prepare training documents, which are labeled. 
- LabeledDocument = Row('id', 'text', 'label') - training = sqlCtx.inferSchema( - sc.parallelize([(0L, "a b c d e spark", 1.0), - (1L, "b d", 0.0), - (2L, "spark f g h", 1.0), - (3L, "hadoop mapreduce", 0.0)]) - .map(lambda x: LabeledDocument(*x))) + LabeledDocument = Row("id", "text", "label") + training = sc.parallelize([(0L, "a b c d e spark", 1.0), + (1L, "b d", 0.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 0.0)]) \ + .map(lambda x: LabeledDocument(*x)).toDF() # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. - tokenizer = Tokenizer() \ - .setInputCol("text") \ - .setOutputCol("words") - hashingTF = HashingTF() \ - .setInputCol(tokenizer.getOutputCol()) \ - .setOutputCol("features") - lr = LogisticRegression() \ - .setMaxIter(10) \ - .setRegParam(0.01) - pipeline = Pipeline() \ - .setStages([tokenizer, hashingTF, lr]) + tokenizer = Tokenizer(inputCol="text", outputCol="words") + hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") + lr = LogisticRegression(maxIter=10, regParam=0.01) + pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. model = pipeline.fit(training) # Prepare test documents, which are unlabeled. - Document = Row('id', 'text') - test = sqlCtx.inferSchema( - sc.parallelize([(4L, "spark i j k"), - (5L, "l m n"), - (6L, "mapreduce spark"), - (7L, "apache hadoop")]) - .map(lambda x: Document(*x))) + Document = Row("id", "text") + test = sc.parallelize([(4L, "spark i j k"), + (5L, "l m n"), + (6L, "mapreduce spark"), + (7L, "apache hadoop")]) \ + .map(lambda x: Document(*x)).toDF() # Make predictions on test documents and print columns of interest. prediction = model.transform(test) - prediction.registerTempTable("prediction") - selected = sqlCtx.sql("SELECT id, text, prediction from prediction") + selected = prediction.select("id", "text", "prediction") for row in selected.collect(): print row diff --git a/python/docs/conf.py b/python/docs/conf.py index b00dce95d65b4..cbbf7ffb08992 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -97,6 +97,10 @@ # If true, keep warnings as "system message" paragraphs in the built documents. #keep_warnings = False +# -- Options for autodoc -------------------------------------------------- + +# Look at the first line of the docstring for function and method signatures. +autodoc_docstring_signature = True # -- Options for HTML output ---------------------------------------------- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 6bd2aa8e47837..b6de7493d7523 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -15,7 +15,7 @@ # limitations under the License. 
# -from pyspark.ml.util import inherit_doc +from pyspark.ml.util import inherit_doc, keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,\ HasRegParam @@ -32,22 +32,46 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti >>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors - >>> dataset = sqlCtx.inferSchema(sc.parallelize([ \ - Row(label=1.0, features=Vectors.dense(1.0)), \ - Row(label=0.0, features=Vectors.sparse(1, [], []))])) - >>> lr = LogisticRegression() \ - .setMaxIter(5) \ - .setRegParam(0.01) - >>> model = lr.fit(dataset) - >>> test0 = sqlCtx.inferSchema(sc.parallelize([Row(features=Vectors.dense(-1.0))])) + >>> df = sc.parallelize([ + ... Row(label=1.0, features=Vectors.dense(1.0)), + ... Row(label=0.0, features=Vectors.sparse(1, [], []))]).toDF() + >>> lr = LogisticRegression(maxIter=5, regParam=0.01) + >>> model = lr.fit(df) + >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0))]).toDF() >>> print model.transform(test0).head().prediction 0.0 - >>> test1 = sqlCtx.inferSchema(sc.parallelize([Row(features=Vectors.sparse(1, [0], [1.0]))])) + >>> test1 = sc.parallelize([Row(features=Vectors.sparse(1, [0], [1.0]))]).toDF() >>> print model.transform(test1).head().prediction 1.0 + >>> lr.setParams("vector") + Traceback (most recent call last): + ... + TypeError: Method setParams forces keyword arguments. """ _java_class = "org.apache.spark.ml.classification.LogisticRegression" + @keyword_only + def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, regParam=0.1): + """ + __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ + maxIter=100, regParam=0.1) + """ + super(LogisticRegression, self).__init__() + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, regParam=0.1): + """ + setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ + maxIter=100, regParam=0.1) + Sets params for logistic regression. + """ + kwargs = self.setParams._input_kwargs + return self._set_params(**kwargs) + def _create_model(self, java_model): return LogisticRegressionModel(java_model) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index e088acd0ca82d..f1ddbb478dd9c 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -16,7 +16,7 @@ # from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasNumFeatures -from pyspark.ml.util import inherit_doc +from pyspark.ml.util import inherit_doc, keyword_only from pyspark.ml.wrapper import JavaTransformer __all__ = ['Tokenizer', 'HashingTF'] @@ -29,18 +29,45 @@ class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol): splits it by white spaces. >>> from pyspark.sql import Row - >>> dataset = sqlCtx.inferSchema(sc.parallelize([Row(text="a b c")])) - >>> tokenizer = Tokenizer() \ - .setInputCol("text") \ - .setOutputCol("words") - >>> print tokenizer.transform(dataset).head() + >>> df = sc.parallelize([Row(text="a b c")]).toDF() + >>> tokenizer = Tokenizer(inputCol="text", outputCol="words") + >>> print tokenizer.transform(df).head() Row(text=u'a b c', words=[u'a', u'b', u'c']) - >>> print tokenizer.transform(dataset, {tokenizer.outputCol: "tokens"}).head() + >>> # Change a parameter. 
+ >>> print tokenizer.setParams(outputCol="tokens").transform(df).head() Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + >>> # Temporarily modify a parameter. + >>> print tokenizer.transform(df, {tokenizer.outputCol: "words"}).head() + Row(text=u'a b c', words=[u'a', u'b', u'c']) + >>> print tokenizer.transform(df).head() + Row(text=u'a b c', tokens=[u'a', u'b', u'c']) + >>> # Must use keyword arguments to specify params. + >>> tokenizer.setParams("text") + Traceback (most recent call last): + ... + TypeError: Method setParams forces keyword arguments. """ _java_class = "org.apache.spark.ml.feature.Tokenizer" + @keyword_only + def __init__(self, inputCol="input", outputCol="output"): + """ + __init__(self, inputCol="input", outputCol="output") + """ + super(Tokenizer, self).__init__() + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol="input", outputCol="output"): + """ + setParams(self, inputCol="input", outputCol="output") + Sets params for this Tokenizer. + """ + kwargs = self.setParams._input_kwargs + return self._set_params(**kwargs) + @inherit_doc class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures): @@ -49,20 +76,37 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures): hashing trick. >>> from pyspark.sql import Row - >>> dataset = sqlCtx.inferSchema(sc.parallelize([Row(words=["a", "b", "c"])])) - >>> hashingTF = HashingTF() \ - .setNumFeatures(10) \ - .setInputCol("words") \ - .setOutputCol("features") - >>> print hashingTF.transform(dataset).head().features + >>> df = sc.parallelize([Row(words=["a", "b", "c"])]).toDF() + >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features") + >>> print hashingTF.transform(df).head().features + (10,[7,8,9],[1.0,1.0,1.0]) + >>> print hashingTF.setParams(outputCol="freqs").transform(df).head().freqs (10,[7,8,9],[1.0,1.0,1.0]) >>> params = {hashingTF.numFeatures: 5, hashingTF.outputCol: "vector"} - >>> print hashingTF.transform(dataset, params).head().vector + >>> print hashingTF.transform(df, params).head().vector (5,[2,3,4],[1.0,1.0,1.0]) """ _java_class = "org.apache.spark.ml.feature.HashingTF" + @keyword_only + def __init__(self, numFeatures=1 << 18, inputCol="input", outputCol="output"): + """ + __init__(self, numFeatures=1 << 18, inputCol="input", outputCol="output") + """ + super(HashingTF, self).__init__() + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, numFeatures=1 << 18, inputCol="input", outputCol="output"): + """ + setParams(self, numFeatures=1 << 18, inputCol="input", outputCol="output") + Sets params for this HashingTF. + """ + kwargs = self.setParams._input_kwargs + return self._set_params(**kwargs) + if __name__ == "__main__": import doctest diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 5566792cead48..e3a53dd780c4c 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -80,3 +80,11 @@ def _dummy(): dummy = Params() dummy.uid = "undefined" return dummy + + def _set_params(self, **kwargs): + """ + Sets params. 
+ """ + for param, value in kwargs.iteritems(): + self.paramMap[getattr(self, param)] = value + return self diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 2d239f8c802a0..18d8a58f357bd 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -18,7 +18,7 @@ from abc import ABCMeta, abstractmethod from pyspark.ml.param import Param, Params -from pyspark.ml.util import inherit_doc +from pyspark.ml.util import inherit_doc, keyword_only __all__ = ['Estimator', 'Transformer', 'Pipeline', 'PipelineModel'] @@ -89,10 +89,16 @@ class Pipeline(Estimator): identity transformer. """ - def __init__(self): + @keyword_only + def __init__(self, stages=[]): + """ + __init__(self, stages=[]) + """ super(Pipeline, self).__init__() #: Param for pipeline stages. self.stages = Param(self, "stages", "pipeline stages") + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) def setStages(self, value): """ @@ -110,6 +116,15 @@ def getStages(self): if self.stages in self.paramMap: return self.paramMap[self.stages] + @keyword_only + def setParams(self, stages=[]): + """ + setParams(self, stages=[]) + Sets params for Pipeline. + """ + kwargs = self.setParams._input_kwargs + return self._set_params(**kwargs) + def fit(self, dataset, params={}): paramMap = self._merge_params(params) stages = paramMap[self.stages] diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index b1caa84b6306a..81d3f0882b8a9 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -15,6 +15,7 @@ # limitations under the License. # +from functools import wraps import uuid @@ -32,6 +33,20 @@ def inherit_doc(cls): return cls +def keyword_only(func): + """ + A decorator that forces keyword arguments in the wrapped method + and saves actual input keyword arguments in `_input_kwargs`. + """ + @wraps(func) + def wrapper(*args, **kwargs): + if len(args) > 1: + raise TypeError("Method %s forces keyword arguments." % func.__name__) + wrapper._input_kwargs = kwargs + return func(*args, **kwargs) + return wrapper + + class Identifiable(object): """ Object with a unique ID. 
From acf2558dc92901c342262c35eebb95f2a9b7a9ae Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 15 Feb 2015 20:41:27 -0800 Subject: [PATCH 176/357] SPARK-5815 [MLLIB] Deprecate SVDPlusPlus APIs that expose DoubleMatrix from JBLAS Deprecate SVDPlusPlus.run and introduce SVDPlusPlus.runSVDPlusPlus with return type that doesn't include DoubleMatrix CC mengxr Author: Sean Owen Closes #4614 from srowen/SPARK-5815 and squashes the following commits: 288cb05 [Sean Owen] Clarify deprecation plans in scaladoc 497458e [Sean Owen] Deprecate SVDPlusPlus.run and introduce SVDPlusPlus.runSVDPlusPlus with return type that doesn't include DoubleMatrix --- .../apache/spark/graphx/lib/SVDPlusPlus.scala | 25 +++++++++++++++++++ .../spark/graphx/lib/SVDPlusPlusSuite.scala | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala index 112ed09ef46dc..fc84cfbe64184 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala @@ -17,6 +17,8 @@ package org.apache.spark.graphx.lib +import org.apache.spark.annotation.Experimental + import scala.util.Random import org.jblas.DoubleMatrix import org.apache.spark.rdd._ @@ -38,6 +40,8 @@ object SVDPlusPlus { extends Serializable /** + * :: Experimental :: + * * Implement SVD++ based on "Factorization Meets the Neighborhood: * a Multifaceted Collaborative Filtering Model", * available at [[http://public.research.att.com/~volinsky/netflix/kdd08koren.pdf]]. @@ -45,12 +49,33 @@ object SVDPlusPlus { * The prediction rule is rui = u + bu + bi + qi*(pu + |N(u)|^(-0.5)*sum(y)), * see the details on page 6. * + * This method temporarily replaces `run()`, and replaces `DoubleMatrix` in `run()`'s return + * value with `Array[Double]`. In 1.4.0, this method will be deprecated, but will be copied + * to replace `run()`, which will then be undeprecated. + * * @param edges edges for constructing the graph * * @param conf SVDPlusPlus parameters * * @return a graph with vertex attributes containing the trained model */ + @Experimental + def runSVDPlusPlus(edges: RDD[Edge[Double]], conf: Conf) + : (Graph[(Array[Double], Array[Double], Double, Double), Double], Double) = + { + val (graph, u) = run(edges, conf) + // Convert DoubleMatrix to Array[Double]: + val newVertices = graph.vertices.mapValues(v => (v._1.toArray, v._2.toArray, v._3, v._4)) + (Graph(newVertices, graph.edges), u) + } + + /** + * This method is deprecated in favor of `runSVDPlusPlus()`, which replaces `DoubleMatrix` + * with `Array[Double]` in its return value. This method is deprecated. It will effectively + * be removed in 1.4.0 when `runSVDPlusPlus()` is copied to replace `run()`, and hence the + * return type of this method changes. 
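   * For illustration only, a minimal migration sketch based on the two signatures above (the
   * value names `oldGraph`/`newGraph` are placeholders):
   * {{{
   *   // Old API: vertex attributes carry org.jblas.DoubleMatrix values
   *   val (oldGraph, oldU) = SVDPlusPlus.run(edges, conf)
   *   // New API: same model, but vertex attributes use Array[Double]
   *   val (newGraph, newU) = SVDPlusPlus.runSVDPlusPlus(edges, conf)
   *   // newGraph.vertices values have type (Array[Double], Array[Double], Double, Double)
   * }}}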
+ */ + @deprecated("Call runSVDPlusPlus", "1.3.0") def run(edges: RDD[Edge[Double]], conf: Conf) : (Graph[(DoubleMatrix, DoubleMatrix, Double, Double), Double], Double) = { diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala index e01df56e94de9..9987a4b1a3c25 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala @@ -32,7 +32,7 @@ class SVDPlusPlusSuite extends FunSuite with LocalSparkContext { Edge(fields(0).toLong * 2, fields(1).toLong * 2 + 1, fields(2).toDouble) } val conf = new SVDPlusPlus.Conf(10, 2, 0.0, 5.0, 0.007, 0.007, 0.005, 0.015) // 2 iterations - var (graph, u) = SVDPlusPlus.run(edges, conf) + var (graph, u) = SVDPlusPlus.runSVDPlusPlus(edges, conf) graph.cache() val err = graph.vertices.collect().map{ case (vid, vd) => if (vid % 2 == 1) vd._4 else 0.0 From c78a12c4cc4d4312c4ee1069d3b218882d32d678 Mon Sep 17 00:00:00 2001 From: Peter Rudenko Date: Sun, 15 Feb 2015 20:51:32 -0800 Subject: [PATCH 177/357] [Ml] SPARK-5796 Don't transform data on a last estimator in Pipeline If it's a last estimator in Pipeline there's no need to transform data, since there's no next stage that would consume this data. Author: Peter Rudenko Closes #4590 from petro-rudenko/patch-1 and squashes the following commits: d13ec33 [Peter Rudenko] [Ml] SPARK-5796 Don't transform data on a last estimator in Pipeline --- mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index bb291e6e1fd7d..5607ed21afe18 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -114,7 +114,9 @@ class Pipeline extends Estimator[PipelineModel] { throw new IllegalArgumentException( s"Do not support stage $stage of type ${stage.getClass}") } - curDataset = transformer.transform(curDataset, paramMap) + if (index < indexOfLastEstimator) { + curDataset = transformer.transform(curDataset, paramMap) + } transformers += transformer } else { transformers += stage.asInstanceOf[Transformer] From d51d6ba1547ae75ac76c9e6d8ea99e937eb7d09f Mon Sep 17 00:00:00 2001 From: Peter Rudenko Date: Mon, 16 Feb 2015 00:07:23 -0800 Subject: [PATCH 178/357] [Ml] SPARK-5804 Explicitly manage cache in Crossvalidator k-fold loop On a big dataset explicitly unpersist train and validation folds allows to load more data into memory in the next loop iteration. On my environment (single node 8Gb worker RAM, 2 GB dataset file, 3 folds for cross validation), saved more than 5 minutes. 
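As a rough sketch of the pattern (a simplified view of the k-fold loop; the helper
`toDataFrame` and the surrounding identifiers are illustrative, not the exact
CrossValidator internals):

    // Cache each fold only while it is needed, then release it explicitly so the
    // next iteration's folds can fit in memory.
    splits.zipWithIndex.foreach { case ((training, validation), splitIndex) =>
      val trainingDataset = toDataFrame(training).cache()
      val validationDataset = toDataFrame(validation).cache()
      val models = est.fit(trainingDataset, epm)
      trainingDataset.unpersist()        // done fitting on this fold
      // ... evaluate each model against validationDataset ...
      validationDataset.unpersist()      // done evaluating on this fold
    }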
Author: Peter Rudenko Closes #4595 from petro-rudenko/patch-2 and squashes the following commits: 66a7cfb [Peter Rudenko] Move validationDataset cache to declaration c5f3265 [Peter Rudenko] [Ml] SPARK-5804 Explicitly manage cache in Crossvalidator k-fold loop --- .../main/scala/org/apache/spark/ml/tuning/CrossValidator.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index b139bc8dcb44c..b07a68269cc2b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -108,6 +108,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP // multi-model training logDebug(s"Train split $splitIndex with multiple sets of parameters.") val models = est.fit(trainingDataset, epm).asInstanceOf[Seq[Model[_]]] + trainingDataset.unpersist() var i = 0 while (i < numModels) { val metric = eval.evaluate(models(i).transform(validationDataset, epm(i)), map) @@ -115,6 +116,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP metrics(i) += metric i += 1 } + validationDataset.unpersist() } f2jBLAS.dscal(numModels, 1.0 / map(numFolds), metrics, 1) logInfo(s"Average cross-validation metrics: ${metrics.toSeq}") From 199a9e80275ac70582ea32f0f2f5a0a15b168785 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 16 Feb 2015 01:33:37 -0800 Subject: [PATCH 179/357] [Minor] [SQL] Renames stringRddToDataFrame to stringRddToDataFrameHolder for consistency [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4613) Author: Cheng Lian Closes #4613 from liancheng/df-implicit-rename and squashes the following commits: db8bdd3 [Cheng Lian] Renames stringRddToDataFrame to stringRddToDataFrameHolder for consistency --- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 6d1914808aeed..d20b42de22706 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -235,7 +235,7 @@ class SQLContext(@transient val sparkContext: SparkContext) } /** Creates a single column DataFrame from an RDD[String]. */ - implicit def stringRddToDataFrame(data: RDD[String]): DataFrameHolder = { + implicit def stringRddToDataFrameHolder(data: RDD[String]): DataFrameHolder = { val dataType = StringType val rows = data.mapPartitions { iter => val row = new SpecificMutableRow(dataType :: Nil) From 3ce58cf9c0ffe8b867ca79b404fe3fa291cf0e56 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 16 Feb 2015 01:38:31 -0800 Subject: [PATCH 180/357] [SPARK-4553] [SPARK-5767] [SQL] Wires Parquet data source with the newly introduced write support for data source API This PR migrates the Parquet data source to the new data source write support API. Now users can also overwriting and appending to existing tables. Notice that inserting into partitioned tables is not supported yet. When Parquet data source is enabled, insertion to Hive Metastore Parquet tables is also fullfilled by the Parquet data source. 
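From the user-facing side, a minimal sketch of the new write path (the DataFrame names and the
target path are placeholders; the save() call is the same one exercised by the makeParquetFile
helper added to ParquetTest below):

    import org.apache.spark.sql.SaveMode

    // Write a DataFrame through the Parquet data source, replacing any existing data.
    df.save("/tmp/parquet-table", "org.apache.spark.sql.parquet", SaveMode.Overwrite)

    // Appending to the same location is now supported as well.
    moreRows.save("/tmp/parquet-table", "org.apache.spark.sql.parquet", SaveMode.Append)

As noted above, when the Parquet data source is enabled, inserting into Hive Metastore Parquet
tables follows this same path.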
This is done by the newly introduced `HiveMetastoreCatalog.ParquetConversions` rule, which is a "proper" implementation of the original hacky `HiveStrategies.ParquetConversion`. The latter is still preserved, and can be removed together with the old Parquet support in the future. TODO: - [x] Update outdated comments in `newParquet.scala`. [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4563) Author: Cheng Lian Closes #4563 from liancheng/parquet-refining and squashes the following commits: fa98d27 [Cheng Lian] Fixes test cases which should disable off Parquet data source 2476e82 [Cheng Lian] Fixes compilation error introduced during rebasing a83d290 [Cheng Lian] Passes Hive Metastore partitioning information to ParquetRelation2 --- .../org/apache/spark/sql/SQLContext.scala | 2 +- .../sql/parquet/ParquetTableOperations.scala | 2 +- .../spark/sql/parquet/ParquetTest.scala | 38 +- .../apache/spark/sql/parquet/newParquet.scala | 265 ++++++----- .../sql/parquet/ParquetFilterSuite.scala | 343 +++++++------- .../spark/sql/parquet/ParquetIOSuite.scala | 417 ++++++++++-------- .../ParquetPartitionDiscoverySuite.scala | 202 ++++++++- .../spark/sql/parquet/ParquetQuerySuite.scala | 147 +++--- .../apache/spark/sql/hive/HiveContext.scala | 11 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 127 ++++-- .../spark/sql/hive/HiveStrategies.scala | 22 +- .../spark/sql/parquet/parquetSuites.scala | 247 +++++++---- 12 files changed, 1148 insertions(+), 675 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index d20b42de22706..b42a52ebd2f16 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -446,7 +446,7 @@ class SQLContext(@transient val sparkContext: SparkContext) baseRelationToDataFrame(parquet.ParquetRelation2(path +: paths, Map.empty)(this)) } else { DataFrame(this, parquet.ParquetRelation( - paths.mkString(","), Some(sparkContext.hadoopConfiguration), this)) + (path +: paths).mkString(","), Some(sparkContext.hadoopConfiguration), this)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 28cd17fde46ab..7dd8bea49b8a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -647,6 +647,6 @@ private[parquet] object FileSystemHelper { sys.error("ERROR: attempting to append to set of Parquet files and found file" + s"that does not match name pattern: $other") case _ => 0 - }.reduceLeft((a, b) => if (a < b) b else a) + }.reduceOption(_ max _).getOrElse(0) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala index 538d774eb97eb..d0856df8d4f43 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -23,8 +23,8 @@ import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag import scala.util.Try -import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.util +import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} import org.apache.spark.util.Utils /** @@ -37,7 +37,8 @@ 
import org.apache.spark.util.Utils trait ParquetTest { val sqlContext: SQLContext - import sqlContext._ + import sqlContext.implicits.{localSeqToDataFrameHolder, rddToDataFrameHolder} + import sqlContext.{conf, sparkContext} protected def configuration = sparkContext.hadoopConfiguration @@ -49,11 +50,11 @@ trait ParquetTest { */ protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { val (keys, values) = pairs.unzip - val currentValues = keys.map(key => Try(getConf(key)).toOption) - (keys, values).zipped.foreach(setConf) + val currentValues = keys.map(key => Try(conf.getConf(key)).toOption) + (keys, values).zipped.foreach(conf.setConf) try f finally { keys.zip(currentValues).foreach { - case (key, Some(value)) => setConf(key, value) + case (key, Some(value)) => conf.setConf(key, value) case (key, None) => conf.unsetConf(key) } } @@ -88,7 +89,6 @@ trait ParquetTest { protected def withParquetFile[T <: Product: ClassTag: TypeTag] (data: Seq[T]) (f: String => Unit): Unit = { - import sqlContext.implicits._ withTempPath { file => sparkContext.parallelize(data).toDF().saveAsParquetFile(file.getCanonicalPath) f(file.getCanonicalPath) @@ -102,14 +102,14 @@ trait ParquetTest { protected def withParquetRDD[T <: Product: ClassTag: TypeTag] (data: Seq[T]) (f: DataFrame => Unit): Unit = { - withParquetFile(data)(path => f(parquetFile(path))) + withParquetFile(data)(path => f(sqlContext.parquetFile(path))) } /** * Drops temporary table `tableName` after calling `f`. */ protected def withTempTable(tableName: String)(f: => Unit): Unit = { - try f finally dropTempTable(tableName) + try f finally sqlContext.dropTempTable(tableName) } /** @@ -125,4 +125,26 @@ trait ParquetTest { withTempTable(tableName)(f) } } + + protected def makeParquetFile[T <: Product: ClassTag: TypeTag]( + data: Seq[T], path: File): Unit = { + data.toDF().save(path.getCanonicalPath, "org.apache.spark.sql.parquet", SaveMode.Overwrite) + } + + protected def makePartitionDir( + basePath: File, + defaultPartitionName: String, + partitionCols: (String, Any)*): File = { + val partNames = partitionCols.map { case (k, v) => + val valueString = if (v == null || v == "") defaultPartitionName else v.toString + s"$k=$valueString" + } + + val partDir = partNames.foldLeft(basePath) { (parent, child) => + new File(parent, child) + } + + assert(partDir.mkdirs(), s"Couldn't create directory $partDir") + partDir + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 3a9f0600617be..9279f5a903f55 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -20,7 +20,7 @@ import java.io.IOException import java.lang.{Double => JDouble, Float => JFloat, Long => JLong} import java.math.{BigDecimal => JBigDecimal} import java.text.SimpleDateFormat -import java.util.{List => JList, Date} +import java.util.{Date, List => JList} import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer @@ -34,8 +34,9 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.mapreduce.{InputSplit, Job, JobContext} import parquet.filter2.predicate.FilterApi import parquet.format.converter.ParquetMetadataConverter -import parquet.hadoop.{ParquetInputFormat, _} +import parquet.hadoop.metadata.CompressionCodecName import parquet.hadoop.util.ContextUtil +import parquet.hadoop.{ParquetInputFormat, 
_} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.SparkHadoopUtil @@ -44,21 +45,36 @@ import org.apache.spark.rdd.{NewHadoopPartition, NewHadoopRDD, RDD} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.parquet.ParquetTypesConverter._ import org.apache.spark.sql.sources._ -import org.apache.spark.sql.{DataFrame, Row, SaveMode, SQLConf, SQLContext} import org.apache.spark.sql.types.{IntegerType, StructField, StructType, _} -import org.apache.spark.{Partition => SparkPartition, TaskContext, SerializableWritable, Logging, SparkException} - +import org.apache.spark.sql.{DataFrame, Row, SQLConf, SQLContext, SaveMode} +import org.apache.spark.{Logging, Partition => SparkPartition, SerializableWritable, SparkException, TaskContext} /** - * Allows creation of parquet based tables using the syntax - * `CREATE TEMPORARY TABLE ... USING org.apache.spark.sql.parquet`. Currently the only option - * required is `path`, which should be the location of a collection of, optionally partitioned, - * parquet files. + * Allows creation of Parquet based tables using the syntax: + * {{{ + * CREATE TEMPORARY TABLE ... USING org.apache.spark.sql.parquet OPTIONS (...) + * }}} + * + * Supported options include: + * + * - `path`: Required. When reading Parquet files, `path` should point to the location of the + * Parquet file(s). It can be either a single raw Parquet file, or a directory of Parquet files. + * In the latter case, this data source tries to discover partitioning information if the the + * directory is structured in the same style of Hive partitioned tables. When writing Parquet + * file, `path` should point to the destination folder. + * + * - `mergeSchema`: Optional. Indicates whether we should merge potentially different (but + * compatible) schemas stored in all Parquet part-files. + * + * - `partition.defaultName`: Optional. Partition name used when a value of a partition column is + * null or empty string. This is similar to the `hive.exec.default.partition.name` configuration + * in Hive. */ class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { + private def checkPath(parameters: Map[String, String]): String = { parameters.getOrElse("path", sys.error("'path' must be specified for parquet tables.")) } @@ -70,6 +86,7 @@ class DefaultSource ParquetRelation2(Seq(checkPath(parameters)), parameters, None)(sqlContext) } + /** Returns a new base relation with the given parameters and schema. */ override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], @@ -77,6 +94,7 @@ class DefaultSource ParquetRelation2(Seq(checkPath(parameters)), parameters, Some(schema))(sqlContext) } + /** Returns a new base relation with the given parameters and save given data into it. 
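   * The given save mode is interpreted as in the pattern match below: `ErrorIfExists` fails when
   * the target path already exists, `Ignore` skips the write when data is already present, and
   * `Append` / `Overwrite` both trigger an insertion, with `Overwrite` replacing existing data.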
*/ override def createRelation( sqlContext: SQLContext, mode: SaveMode, @@ -85,33 +103,19 @@ class DefaultSource val path = checkPath(parameters) val filesystemPath = new Path(path) val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - val doSave = if (fs.exists(filesystemPath)) { - mode match { - case SaveMode.Append => - sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}") - case SaveMode.Overwrite => - fs.delete(filesystemPath, true) - true - case SaveMode.ErrorIfExists => - sys.error(s"path $path already exists.") - case SaveMode.Ignore => false - } - } else { - true + val doInsertion = (mode, fs.exists(filesystemPath)) match { + case (SaveMode.ErrorIfExists, true) => + sys.error(s"path $path already exists.") + case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) => + true + case (SaveMode.Ignore, exists) => + !exists } - val relation = if (doSave) { - // Only save data when the save mode is not ignore. - ParquetRelation.createEmpty( - path, - data.schema.toAttributes, - false, - sqlContext.sparkContext.hadoopConfiguration, - sqlContext) - - val createdRelation = createRelation(sqlContext, parameters, data.schema) - createdRelation.asInstanceOf[ParquetRelation2].insert(data, true) - + val relation = if (doInsertion) { + val createdRelation = + createRelation(sqlContext, parameters, data.schema).asInstanceOf[ParquetRelation2] + createdRelation.insert(data, overwrite = mode == SaveMode.Overwrite) createdRelation } else { // If the save mode is Ignore, we will just create the relation based on existing data. @@ -122,37 +126,31 @@ class DefaultSource } } -private[parquet] case class Partition(values: Row, path: String) +private[sql] case class Partition(values: Row, path: String) -private[parquet] case class PartitionSpec(partitionColumns: StructType, partitions: Seq[Partition]) +private[sql] case class PartitionSpec(partitionColumns: StructType, partitions: Seq[Partition]) /** * An alternative to [[ParquetRelation]] that plugs in using the data sources API. This class is - * currently not intended as a full replacement of the parquet support in Spark SQL though it is - * likely that it will eventually subsume the existing physical plan implementation. - * - * Compared with the current implementation, this class has the following notable differences: - * - * Partitioning: Partitions are auto discovered and must be in the form of directories `key=value/` - * located at `path`. Currently only a single partitioning column is supported and it must - * be an integer. This class supports both fully self-describing data, which contains the partition - * key, and data where the partition key is only present in the folder structure. The presence - * of the partitioning key in the data is also auto-detected. The `null` partition is not yet - * supported. + * intended as a full replacement of the Parquet support in Spark SQL. The old implementation will + * be deprecated and eventually removed once this version is proved to be stable enough. * - * Metadata: The metadata is automatically discovered by reading the first parquet file present. - * There is currently no support for working with files that have different schema. Additionally, - * when parquet metadata caching is turned on, the FileStatus objects for all data will be cached - * to improve the speed of interactive querying. When data is added to a table it must be dropped - * and recreated to pick up any changes. 
+ * Compared with the old implementation, this class has the following notable differences: * - * Statistics: Statistics for the size of the table are automatically populated during metadata - * discovery. + * - Partitioning discovery: Hive style multi-level partitions are auto discovered. + * - Metadata discovery: Parquet is a format comes with schema evolving support. This data source + * can detect and merge schemas from all Parquet part-files as long as they are compatible. + * Also, metadata and [[FileStatus]]es are cached for better performance. + * - Statistics: Statistics for the size of the table are automatically populated during schema + * discovery. */ @DeveloperApi -case class ParquetRelation2 - (paths: Seq[String], parameters: Map[String, String], maybeSchema: Option[StructType] = None) - (@transient val sqlContext: SQLContext) +case class ParquetRelation2( + paths: Seq[String], + parameters: Map[String, String], + maybeSchema: Option[StructType] = None, + maybePartitionSpec: Option[PartitionSpec] = None)( + @transient val sqlContext: SQLContext) extends CatalystScan with InsertableRelation with SparkHadoopMapReduceUtil @@ -175,43 +173,90 @@ case class ParquetRelation2 override def equals(other: Any) = other match { case relation: ParquetRelation2 => + // If schema merging is required, we don't compare the actual schemas since they may evolve. + val schemaEquality = if (shouldMergeSchemas) { + shouldMergeSchemas == relation.shouldMergeSchemas + } else { + schema == relation.schema + } + paths.toSet == relation.paths.toSet && + schemaEquality && maybeMetastoreSchema == relation.maybeMetastoreSchema && - (shouldMergeSchemas == relation.shouldMergeSchemas || schema == relation.schema) + maybePartitionSpec == relation.maybePartitionSpec + case _ => false } private[sql] def sparkContext = sqlContext.sparkContext - @transient private val fs = FileSystem.get(sparkContext.hadoopConfiguration) - private class MetadataCache { + // `FileStatus` objects of all "_metadata" files. private var metadataStatuses: Array[FileStatus] = _ + + // `FileStatus` objects of all "_common_metadata" files. private var commonMetadataStatuses: Array[FileStatus] = _ + + // Parquet footer cache. private var footers: Map[FileStatus, Footer] = _ - private var parquetSchema: StructType = _ + // `FileStatus` objects of all data files (Parquet part-files). var dataStatuses: Array[FileStatus] = _ + + // Partition spec of this table, including names, data types, and values of each partition + // column, and paths of each partition. var partitionSpec: PartitionSpec = _ + + // Schema of the actual Parquet files, without partition columns discovered from partition + // directory paths. + var parquetSchema: StructType = _ + + // Schema of the whole table, including partition columns. var schema: StructType = _ - var dataSchemaIncludesPartitionKeys: Boolean = _ + // Indicates whether partition columns are also included in Parquet data file schema. If not, + // we need to fill in partition column values into read rows when scanning the table. 
+ var partitionKeysIncludedInParquetSchema: Boolean = _ + + def prepareMetadata(path: Path, schema: StructType, conf: Configuration): Unit = { + conf.set( + ParquetOutputFormat.COMPRESSION, + ParquetRelation + .shortParquetCompressionCodecNames + .getOrElse( + sqlContext.conf.parquetCompressionCodec.toUpperCase, + CompressionCodecName.UNCOMPRESSED).name()) + + ParquetRelation.enableLogForwarding() + ParquetTypesConverter.writeMetaData(schema.toAttributes, path, conf) + } + + /** + * Refreshes `FileStatus`es, footers, partition spec, and table schema. + */ def refresh(): Unit = { - val baseStatuses = { - val statuses = paths.distinct.map(p => fs.getFileStatus(fs.makeQualified(new Path(p)))) - // Support either reading a collection of raw Parquet part-files, or a collection of folders - // containing Parquet files (e.g. partitioned Parquet table). - assert(statuses.forall(!_.isDir) || statuses.forall(_.isDir)) - statuses.toArray - } + val fs = FileSystem.get(sparkContext.hadoopConfiguration) + + // Support either reading a collection of raw Parquet part-files, or a collection of folders + // containing Parquet files (e.g. partitioned Parquet table). + val baseStatuses = paths.distinct.map { p => + val qualified = fs.makeQualified(new Path(p)) + + if (!fs.exists(qualified) && maybeSchema.isDefined) { + fs.mkdirs(qualified) + prepareMetadata(qualified, maybeSchema.get, sparkContext.hadoopConfiguration) + } + + fs.getFileStatus(qualified) + }.toArray + assert(baseStatuses.forall(!_.isDir) || baseStatuses.forall(_.isDir)) + // Lists `FileStatus`es of all leaf nodes (files) under all base directories. val leaves = baseStatuses.flatMap { f => - val statuses = SparkHadoopUtil.get.listLeafStatuses(fs, f.getPath).filter { f => + SparkHadoopUtil.get.listLeafStatuses(fs, f.getPath).filter { f => isSummaryFile(f.getPath) || !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith(".")) } - assert(statuses.nonEmpty, s"${f.getPath} is an empty folder.") - statuses } dataStatuses = leaves.filterNot(f => isSummaryFile(f.getPath)) @@ -225,13 +270,14 @@ case class ParquetRelation2 f -> new Footer(f.getPath, parquetMetadata) }.seq.toMap - partitionSpec = { - val partitionDirs = dataStatuses + partitionSpec = maybePartitionSpec.getOrElse { + val partitionDirs = leaves .filterNot(baseStatuses.contains) .map(_.getPath.getParent) .distinct if (partitionDirs.nonEmpty) { + // Parses names and values of partition columns, and infer their data types. ParquetRelation2.parsePartitions(partitionDirs, defaultPartitionName) } else { // No partition directories found, makes an empty specification @@ -241,20 +287,22 @@ case class ParquetRelation2 parquetSchema = maybeSchema.getOrElse(readSchema()) - dataSchemaIncludesPartitionKeys = + partitionKeysIncludedInParquetSchema = isPartitioned && - partitionColumns.forall(f => metadataCache.parquetSchema.fieldNames.contains(f.name)) + partitionColumns.forall(f => parquetSchema.fieldNames.contains(f.name)) schema = { - val fullParquetSchema = if (dataSchemaIncludesPartitionKeys) { - metadataCache.parquetSchema + val fullRelationSchema = if (partitionKeysIncludedInParquetSchema) { + parquetSchema } else { - StructType(metadataCache.parquetSchema.fields ++ partitionColumns.fields) + StructType(parquetSchema.fields ++ partitionColumns.fields) } + // If this Parquet relation is converted from a Hive Metastore table, must reconcile case + // insensitivity issue and possible schema mismatch. 
maybeMetastoreSchema - .map(ParquetRelation2.mergeMetastoreParquetSchema(_, fullParquetSchema)) - .getOrElse(fullParquetSchema) + .map(ParquetRelation2.mergeMetastoreParquetSchema(_, fullRelationSchema)) + .getOrElse(fullRelationSchema) } } @@ -303,13 +351,17 @@ case class ParquetRelation2 @transient private val metadataCache = new MetadataCache metadataCache.refresh() - private def partitionColumns = metadataCache.partitionSpec.partitionColumns + def partitionSpec = metadataCache.partitionSpec - private def partitions = metadataCache.partitionSpec.partitions + def partitionColumns = metadataCache.partitionSpec.partitionColumns - private def isPartitioned = partitionColumns.nonEmpty + def partitions = metadataCache.partitionSpec.partitions - private def dataSchemaIncludesPartitionKeys = metadataCache.dataSchemaIncludesPartitionKeys + def isPartitioned = partitionColumns.nonEmpty + + private def partitionKeysIncludedInDataSchema = metadataCache.partitionKeysIncludedInParquetSchema + + private def parquetSchema = metadataCache.parquetSchema override def schema = metadataCache.schema @@ -412,18 +464,21 @@ case class ParquetRelation2 // When the data does not include the key and the key is requested then we must fill it in // based on information from the input split. - if (!dataSchemaIncludesPartitionKeys && partitionKeyLocations.nonEmpty) { + if (!partitionKeysIncludedInDataSchema && partitionKeyLocations.nonEmpty) { baseRDD.mapPartitionsWithInputSplit { case (split: ParquetInputSplit, iterator) => val partValues = selectedPartitions.collectFirst { case p if split.getPath.getParent.toString == p.path => p.values }.get + val requiredPartOrdinal = partitionKeyLocations.keys.toSeq + iterator.map { pair => val row = pair._2.asInstanceOf[SpecificMutableRow] var i = 0 - while (i < partValues.size) { + while (i < requiredPartOrdinal.size) { // TODO Avoids boxing cost here! - row.update(partitionKeyLocations(i), partValues(i)) + val partOrdinal = requiredPartOrdinal(i) + row.update(partitionKeyLocations(partOrdinal), partValues(partOrdinal)) i += 1 } row @@ -457,6 +512,8 @@ case class ParquetRelation2 } override def insert(data: DataFrame, overwrite: Boolean): Unit = { + assert(paths.size == 1, s"Can't write to multiple destinations: ${paths.mkString(",")}") + // TODO: currently we do not check whether the "schema"s are compatible // That means if one first creates a table and then INSERTs data with // and incompatible schema the execution will fail. It would be nice @@ -464,7 +521,7 @@ case class ParquetRelation2 // before calling execute(). val job = new Job(sqlContext.sparkContext.hadoopConfiguration) - val writeSupport = if (schema.map(_.dataType).forall(_.isPrimitive)) { + val writeSupport = if (parquetSchema.map(_.dataType).forall(_.isPrimitive)) { log.debug("Initializing MutableRowWriteSupport") classOf[MutableRowWriteSupport] } else { @@ -474,7 +531,7 @@ case class ParquetRelation2 ParquetOutputFormat.setWriteSupportClass(job, writeSupport) val conf = ContextUtil.getConfiguration(job) - RowWriteSupport.setSchema(schema.toAttributes, conf) + RowWriteSupport.setSchema(data.schema.toAttributes, conf) val destinationPath = new Path(paths.head) @@ -544,14 +601,12 @@ object ParquetRelation2 { // Whether we should merge schemas collected from all Parquet part-files. 
val MERGE_SCHEMA = "mergeSchema" - // Hive Metastore schema, passed in when the Parquet relation is converted from Metastore - val METASTORE_SCHEMA = "metastoreSchema" - - // Default partition name to use when the partition column value is null or empty string + // Default partition name to use when the partition column value is null or empty string. val DEFAULT_PARTITION_NAME = "partition.defaultName" - // When true, the Parquet data source caches Parquet metadata for performance - val CACHE_METADATA = "cacheMetadata" + // Hive Metastore schema, used when converting Metastore Parquet tables. This option is only used + // internally. + private[sql] val METASTORE_SCHEMA = "metastoreSchema" private[parquet] def readSchema(footers: Seq[Footer], sqlContext: SQLContext): StructType = { footers.map { footer => @@ -579,6 +634,15 @@ object ParquetRelation2 { } } + /** + * Reconciles Hive Metastore case insensitivity issue and data type conflicts between Metastore + * schema and Parquet schema. + * + * Hive doesn't retain case information, while Parquet is case sensitive. On the other hand, the + * schema read from Parquet files may be incomplete (e.g. older versions of Parquet doesn't + * distinguish binary and string). This method generates a correct schema by merging Metastore + * schema data types and Parquet schema field names. + */ private[parquet] def mergeMetastoreParquetSchema( metastoreSchema: StructType, parquetSchema: StructType): StructType = { @@ -719,16 +783,15 @@ object ParquetRelation2 { * }}} */ private[parquet] def resolvePartitions(values: Seq[PartitionValues]): Seq[PartitionValues] = { - val distinctColNamesOfPartitions = values.map(_.columnNames).distinct - val columnCount = values.head.columnNames.size - // Column names of all partitions must match - assert(distinctColNamesOfPartitions.size == 1, { - val list = distinctColNamesOfPartitions.mkString("\t", "\n", "") + val distinctPartitionsColNames = values.map(_.columnNames).distinct + assert(distinctPartitionsColNames.size == 1, { + val list = distinctPartitionsColNames.mkString("\t", "\n", "") s"Conflicting partition column names detected:\n$list" }) // Resolves possible type conflicts for each column + val columnCount = values.head.columnNames.size val resolvedValues = (0 until columnCount).map { i => resolveTypeConflicts(values.map(_.literals(i))) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala index f8117c21773ae..eb2d5f25290b1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.parquet +import org.scalatest.BeforeAndAfterAll import parquet.filter2.predicate.Operators._ import parquet.filter2.predicate.{FilterPredicate, Operators} @@ -40,7 +41,7 @@ import org.apache.spark.sql.{Column, DataFrame, QueryTest, SQLConf} * 2. `Tuple1(Option(x))` is used together with `AnyVal` types like `Int` to ensure the inferred * data type is nullable. 
*/ -class ParquetFilterSuite extends QueryTest with ParquetTest { +class ParquetFilterSuiteBase extends QueryTest with ParquetTest { val sqlContext = TestSQLContext private def checkFilterPredicate( @@ -112,210 +113,224 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkBinaryFilterPredicate(predicate, filterClass, Seq(Row(expected)))(rdd) } - def run(prefix: String): Unit = { - test(s"$prefix: filter pushdown - boolean") { - withParquetRDD((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], Seq(Row(true), Row(false))) + test("filter pushdown - boolean") { + withParquetRDD((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], Seq(Row(true), Row(false))) - checkFilterPredicate('_1 === true, classOf[Eq[_]], true) - checkFilterPredicate('_1 !== true, classOf[NotEq[_]], false) - } + checkFilterPredicate('_1 === true, classOf[Eq[_]], true) + checkFilterPredicate('_1 !== true, classOf[NotEq[_]], false) } + } - test(s"$prefix: filter pushdown - short") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit rdd => - checkFilterPredicate(Cast('_1, IntegerType) === 1, classOf[Eq[_]], 1) - checkFilterPredicate( - Cast('_1, IntegerType) !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate(Cast('_1, IntegerType) < 2, classOf[Lt[_]], 1) - checkFilterPredicate(Cast('_1, IntegerType) > 3, classOf[Gt[_]], 4) - checkFilterPredicate(Cast('_1, IntegerType) <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate(Cast('_1, IntegerType) >= 4, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1) === Cast('_1, IntegerType), classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > Cast('_1, IntegerType), classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < Cast('_1, IntegerType), classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= Cast('_1, IntegerType), classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= Cast('_1, IntegerType), classOf[GtEq[_]], 4) - - checkFilterPredicate(!(Cast('_1, IntegerType) < 4), classOf[GtEq[_]], 4) - checkFilterPredicate( - Cast('_1, IntegerType) > 2 && Cast('_1, IntegerType) < 4, classOf[Operators.And], 3) - checkFilterPredicate( - Cast('_1, IntegerType) < 2 || Cast('_1, IntegerType) > 3, - classOf[Operators.Or], - Seq(Row(1), Row(4))) - } + test("filter pushdown - short") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit rdd => + checkFilterPredicate(Cast('_1, IntegerType) === 1, classOf[Eq[_]], 1) + checkFilterPredicate( + Cast('_1, IntegerType) !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + + checkFilterPredicate(Cast('_1, IntegerType) < 2, classOf[Lt[_]], 1) + checkFilterPredicate(Cast('_1, IntegerType) > 3, classOf[Gt[_]], 4) + checkFilterPredicate(Cast('_1, IntegerType) <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate(Cast('_1, IntegerType) >= 4, classOf[GtEq[_]], 4) + + checkFilterPredicate(Literal(1) === Cast('_1, IntegerType), classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > Cast('_1, IntegerType), classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < Cast('_1, IntegerType), classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= Cast('_1, IntegerType), classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= Cast('_1, IntegerType), classOf[GtEq[_]], 4) + + 
checkFilterPredicate(!(Cast('_1, IntegerType) < 4), classOf[GtEq[_]], 4) + checkFilterPredicate( + Cast('_1, IntegerType) > 2 && Cast('_1, IntegerType) < 4, classOf[Operators.And], 3) + checkFilterPredicate( + Cast('_1, IntegerType) < 2 || Cast('_1, IntegerType) > 3, + classOf[Operators.Or], + Seq(Row(1), Row(4))) } + } - test(s"$prefix: filter pushdown - integer") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) + test("filter pushdown - integer") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i)))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) - } + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } + } - test(s"$prefix: filter pushdown - long") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) + test("filter pushdown - long") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, 
classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) - } + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } + } - test(s"$prefix: filter pushdown - float") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) + test("filter pushdown - float") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) - } + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) + 
checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } + } - test(s"$prefix: filter pushdown - double") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) + test("filter pushdown - double") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) - } + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } + } - test(s"$prefix: filter pushdown - string") { - withParquetRDD((1 to 4).map(i => Tuple1(i.toString))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate( - '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.toString))) - - checkFilterPredicate('_1 === "1", classOf[Eq[_]], "1") - checkFilterPredicate( - '_1 !== "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) - - checkFilterPredicate('_1 < "2", classOf[Lt[_]], "1") - checkFilterPredicate('_1 > "3", classOf[Gt[_]], "4") - checkFilterPredicate('_1 <= "1", classOf[LtEq[_]], "1") - checkFilterPredicate('_1 >= "4", classOf[GtEq[_]], "4") - - checkFilterPredicate(Literal("1") === '_1, classOf[Eq[_]], "1") - checkFilterPredicate(Literal("2") > '_1, classOf[Lt[_]], "1") - checkFilterPredicate(Literal("3") < '_1, classOf[Gt[_]], "4") - checkFilterPredicate(Literal("1") >= '_1, classOf[LtEq[_]], "1") - checkFilterPredicate(Literal("4") <= '_1, classOf[GtEq[_]], "4") - - checkFilterPredicate(!('_1 < "4"), classOf[GtEq[_]], "4") - checkFilterPredicate('_1 > "2" && '_1 < "4", classOf[Operators.And], "3") - checkFilterPredicate('_1 
< "2" || '_1 > "3", classOf[Operators.Or], Seq(Row("1"), Row("4"))) - } + test("filter pushdown - string") { + withParquetRDD((1 to 4).map(i => Tuple1(i.toString))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate( + '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.toString))) + + checkFilterPredicate('_1 === "1", classOf[Eq[_]], "1") + checkFilterPredicate( + '_1 !== "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) + + checkFilterPredicate('_1 < "2", classOf[Lt[_]], "1") + checkFilterPredicate('_1 > "3", classOf[Gt[_]], "4") + checkFilterPredicate('_1 <= "1", classOf[LtEq[_]], "1") + checkFilterPredicate('_1 >= "4", classOf[GtEq[_]], "4") + + checkFilterPredicate(Literal("1") === '_1, classOf[Eq[_]], "1") + checkFilterPredicate(Literal("2") > '_1, classOf[Lt[_]], "1") + checkFilterPredicate(Literal("3") < '_1, classOf[Gt[_]], "4") + checkFilterPredicate(Literal("1") >= '_1, classOf[LtEq[_]], "1") + checkFilterPredicate(Literal("4") <= '_1, classOf[GtEq[_]], "4") + + checkFilterPredicate(!('_1 < "4"), classOf[GtEq[_]], "4") + checkFilterPredicate('_1 > "2" && '_1 < "4", classOf[Operators.And], "3") + checkFilterPredicate('_1 < "2" || '_1 > "3", classOf[Operators.Or], Seq(Row("1"), Row("4"))) } + } - test(s"$prefix: filter pushdown - binary") { - implicit class IntToBinary(int: Int) { - def b: Array[Byte] = int.toString.getBytes("UTF-8") - } + test("filter pushdown - binary") { + implicit class IntToBinary(int: Int) { + def b: Array[Byte] = int.toString.getBytes("UTF-8") + } - withParquetRDD((1 to 4).map(i => Tuple1(i.b))) { implicit rdd => - checkBinaryFilterPredicate('_1 === 1.b, classOf[Eq[_]], 1.b) + withParquetRDD((1 to 4).map(i => Tuple1(i.b))) { implicit rdd => + checkBinaryFilterPredicate('_1 === 1.b, classOf[Eq[_]], 1.b) - checkBinaryFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkBinaryFilterPredicate( - '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.b)).toSeq) + checkBinaryFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkBinaryFilterPredicate( + '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.b)).toSeq) - checkBinaryFilterPredicate( - '_1 !== 1.b, classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.b)).toSeq) + checkBinaryFilterPredicate( + '_1 !== 1.b, classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.b)).toSeq) - checkBinaryFilterPredicate('_1 < 2.b, classOf[Lt[_]], 1.b) - checkBinaryFilterPredicate('_1 > 3.b, classOf[Gt[_]], 4.b) - checkBinaryFilterPredicate('_1 <= 1.b, classOf[LtEq[_]], 1.b) - checkBinaryFilterPredicate('_1 >= 4.b, classOf[GtEq[_]], 4.b) + checkBinaryFilterPredicate('_1 < 2.b, classOf[Lt[_]], 1.b) + checkBinaryFilterPredicate('_1 > 3.b, classOf[Gt[_]], 4.b) + checkBinaryFilterPredicate('_1 <= 1.b, classOf[LtEq[_]], 1.b) + checkBinaryFilterPredicate('_1 >= 4.b, classOf[GtEq[_]], 4.b) - checkBinaryFilterPredicate(Literal(1.b) === '_1, classOf[Eq[_]], 1.b) - checkBinaryFilterPredicate(Literal(2.b) > '_1, classOf[Lt[_]], 1.b) - checkBinaryFilterPredicate(Literal(3.b) < '_1, classOf[Gt[_]], 4.b) - checkBinaryFilterPredicate(Literal(1.b) >= '_1, classOf[LtEq[_]], 1.b) - checkBinaryFilterPredicate(Literal(4.b) <= '_1, classOf[GtEq[_]], 4.b) + checkBinaryFilterPredicate(Literal(1.b) === '_1, classOf[Eq[_]], 1.b) + checkBinaryFilterPredicate(Literal(2.b) > '_1, classOf[Lt[_]], 1.b) + checkBinaryFilterPredicate(Literal(3.b) < '_1, classOf[Gt[_]], 4.b) + checkBinaryFilterPredicate(Literal(1.b) >= '_1, 
classOf[LtEq[_]], 1.b) + checkBinaryFilterPredicate(Literal(4.b) <= '_1, classOf[GtEq[_]], 4.b) - checkBinaryFilterPredicate(!('_1 < 4.b), classOf[GtEq[_]], 4.b) - checkBinaryFilterPredicate('_1 > 2.b && '_1 < 4.b, classOf[Operators.And], 3.b) - checkBinaryFilterPredicate( - '_1 < 2.b || '_1 > 3.b, classOf[Operators.Or], Seq(Row(1.b), Row(4.b))) - } + checkBinaryFilterPredicate(!('_1 < 4.b), classOf[GtEq[_]], 4.b) + checkBinaryFilterPredicate('_1 > 2.b && '_1 < 4.b, classOf[Operators.And], 3.b) + checkBinaryFilterPredicate( + '_1 < 2.b || '_1 > 3.b, classOf[Operators.Or], Seq(Row(1.b), Row(4.b))) } } +} + +class ParquetDataSourceOnFilterSuite extends ParquetFilterSuiteBase with BeforeAndAfterAll { + val originalConf = sqlContext.conf.parquetUseDataSourceApi + + override protected def beforeAll(): Unit = { + sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") + } + + override protected def afterAll(): Unit = { + sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) + } +} + +class ParquetDataSourceOffFilterSuite extends ParquetFilterSuiteBase with BeforeAndAfterAll { + val originalConf = sqlContext.conf.parquetUseDataSourceApi - withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") { - run("Parquet data source enabled") + override protected def beforeAll(): Unit = { + sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false") } - withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") { - run("Parquet data source disabled") + override protected def afterAll(): Unit = { + sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index c306330818c0a..208f35761b807 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -21,6 +21,9 @@ import scala.collection.JavaConversions._ import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.scalatest.BeforeAndAfterAll import parquet.example.data.simple.SimpleGroup import parquet.example.data.{Group, GroupWriter} import parquet.hadoop.api.WriteSupport @@ -30,16 +33,13 @@ import parquet.hadoop.{ParquetFileWriter, ParquetWriter} import parquet.io.api.RecordConsumer import parquet.schema.{MessageType, MessageTypeParser} -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf} -import org.apache.spark.sql.functions._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.sql.test.TestSQLContext.implicits._ import org.apache.spark.sql.types.DecimalType +import org.apache.spark.sql.{DataFrame, QueryTest, SQLConf, SaveMode} // Write support class for nested groups: ParquetWriter initializes GroupWriteSupport // with an empty configuration (it is after all not intended to be used in this way?) @@ -64,10 +64,11 @@ private[parquet] class TestGroupWriteSupport(schema: MessageType) extends WriteS /** * A test suite that tests basic Parquet I/O. 
*/ -class ParquetIOSuite extends QueryTest with ParquetTest { - +class ParquetIOSuiteBase extends QueryTest with ParquetTest { val sqlContext = TestSQLContext + import sqlContext.implicits.localSeqToDataFrameHolder + /** * Writes `data` to a Parquet file, reads it back and check file contents. */ @@ -75,229 +76,281 @@ class ParquetIOSuite extends QueryTest with ParquetTest { withParquetRDD(data)(r => checkAnswer(r, data.map(Row.fromTuple))) } - def run(prefix: String): Unit = { - test(s"$prefix: basic data types (without binary)") { - val data = (1 to 4).map { i => - (i % 2 == 0, i, i.toLong, i.toFloat, i.toDouble) - } - checkParquetFile(data) + test("basic data types (without binary)") { + val data = (1 to 4).map { i => + (i % 2 == 0, i, i.toLong, i.toFloat, i.toDouble) } + checkParquetFile(data) + } - test(s"$prefix: raw binary") { - val data = (1 to 4).map(i => Tuple1(Array.fill(3)(i.toByte))) - withParquetRDD(data) { rdd => - assertResult(data.map(_._1.mkString(",")).sorted) { - rdd.collect().map(_.getAs[Array[Byte]](0).mkString(",")).sorted - } + test("raw binary") { + val data = (1 to 4).map(i => Tuple1(Array.fill(3)(i.toByte))) + withParquetRDD(data) { rdd => + assertResult(data.map(_._1.mkString(",")).sorted) { + rdd.collect().map(_.getAs[Array[Byte]](0).mkString(",")).sorted } } + } - test(s"$prefix: string") { - val data = (1 to 4).map(i => Tuple1(i.toString)) - // Property spark.sql.parquet.binaryAsString shouldn't affect Parquet files written by Spark SQL - // as we store Spark SQL schema in the extra metadata. - withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING -> "false")(checkParquetFile(data)) - withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING -> "true")(checkParquetFile(data)) - } + test("string") { + val data = (1 to 4).map(i => Tuple1(i.toString)) + // Property spark.sql.parquet.binaryAsString shouldn't affect Parquet files written by Spark SQL + // as we store Spark SQL schema in the extra metadata. 
+ withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING -> "false")(checkParquetFile(data)) + withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING -> "true")(checkParquetFile(data)) + } - test(s"$prefix: fixed-length decimals") { - - def makeDecimalRDD(decimal: DecimalType): DataFrame = - sparkContext - .parallelize(0 to 1000) - .map(i => Tuple1(i / 100.0)) - .toDF - // Parquet doesn't allow column names with spaces, have to add an alias here - .select($"_1" cast decimal as "dec") - - for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17))) { - withTempPath { dir => - val data = makeDecimalRDD(DecimalType(precision, scale)) - data.saveAsParquetFile(dir.getCanonicalPath) - checkAnswer(parquetFile(dir.getCanonicalPath), data.collect().toSeq) - } + test("fixed-length decimals") { + + def makeDecimalRDD(decimal: DecimalType): DataFrame = + sparkContext + .parallelize(0 to 1000) + .map(i => Tuple1(i / 100.0)) + .toDF + // Parquet doesn't allow column names with spaces, have to add an alias here + .select($"_1" cast decimal as "dec") + + for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17))) { + withTempPath { dir => + val data = makeDecimalRDD(DecimalType(precision, scale)) + data.saveAsParquetFile(dir.getCanonicalPath) + checkAnswer(parquetFile(dir.getCanonicalPath), data.collect().toSeq) } + } - // Decimals with precision above 18 are not yet supported - intercept[RuntimeException] { - withTempPath { dir => - makeDecimalRDD(DecimalType(19, 10)).saveAsParquetFile(dir.getCanonicalPath) - parquetFile(dir.getCanonicalPath).collect() - } + // Decimals with precision above 18 are not yet supported + intercept[RuntimeException] { + withTempPath { dir => + makeDecimalRDD(DecimalType(19, 10)).saveAsParquetFile(dir.getCanonicalPath) + parquetFile(dir.getCanonicalPath).collect() } + } - // Unlimited-length decimals are not yet supported - intercept[RuntimeException] { - withTempPath { dir => - makeDecimalRDD(DecimalType.Unlimited).saveAsParquetFile(dir.getCanonicalPath) - parquetFile(dir.getCanonicalPath).collect() - } + // Unlimited-length decimals are not yet supported + intercept[RuntimeException] { + withTempPath { dir => + makeDecimalRDD(DecimalType.Unlimited).saveAsParquetFile(dir.getCanonicalPath) + parquetFile(dir.getCanonicalPath).collect() } } + } + + test("map") { + val data = (1 to 4).map(i => Tuple1(Map(i -> s"val_$i"))) + checkParquetFile(data) + } + + test("array") { + val data = (1 to 4).map(i => Tuple1(Seq(i, i + 1))) + checkParquetFile(data) + } - test(s"$prefix: map") { - val data = (1 to 4).map(i => Tuple1(Map(i -> s"val_$i"))) - checkParquetFile(data) + test("struct") { + val data = (1 to 4).map(i => Tuple1((i, s"val_$i"))) + withParquetRDD(data) { rdd => + // Structs are converted to `Row`s + checkAnswer(rdd, data.map { case Tuple1(struct) => + Row(Row(struct.productIterator.toSeq: _*)) + }) } + } - test(s"$prefix: array") { - val data = (1 to 4).map(i => Tuple1(Seq(i, i + 1))) - checkParquetFile(data) + test("nested struct with array of array as field") { + val data = (1 to 4).map(i => Tuple1((i, Seq(Seq(s"val_$i"))))) + withParquetRDD(data) { rdd => + // Structs are converted to `Row`s + checkAnswer(rdd, data.map { case Tuple1(struct) => + Row(Row(struct.productIterator.toSeq: _*)) + }) } + } - test(s"$prefix: struct") { - val data = (1 to 4).map(i => Tuple1((i, s"val_$i"))) - withParquetRDD(data) { rdd => - // Structs are converted to `Row`s - checkAnswer(rdd, data.map { case Tuple1(struct) => - Row(Row(struct.productIterator.toSeq: _*)) - }) - } + 
test("nested map with struct as value type") { + val data = (1 to 4).map(i => Tuple1(Map(i -> (i, s"val_$i")))) + withParquetRDD(data) { rdd => + checkAnswer(rdd, data.map { case Tuple1(m) => + Row(m.mapValues(struct => Row(struct.productIterator.toSeq: _*))) + }) } + } - test(s"$prefix: nested struct with array of array as field") { - val data = (1 to 4).map(i => Tuple1((i, Seq(Seq(s"val_$i"))))) - withParquetRDD(data) { rdd => - // Structs are converted to `Row`s - checkAnswer(rdd, data.map { case Tuple1(struct) => - Row(Row(struct.productIterator.toSeq: _*)) - }) - } + test("nulls") { + val allNulls = ( + null.asInstanceOf[java.lang.Boolean], + null.asInstanceOf[Integer], + null.asInstanceOf[java.lang.Long], + null.asInstanceOf[java.lang.Float], + null.asInstanceOf[java.lang.Double]) + + withParquetRDD(allNulls :: Nil) { rdd => + val rows = rdd.collect() + assert(rows.size === 1) + assert(rows.head === Row(Seq.fill(5)(null): _*)) } + } - test(s"$prefix: nested map with struct as value type") { - val data = (1 to 4).map(i => Tuple1(Map(i -> (i, s"val_$i")))) - withParquetRDD(data) { rdd => - checkAnswer(rdd, data.map { case Tuple1(m) => - Row(m.mapValues(struct => Row(struct.productIterator.toSeq: _*))) - }) - } + test("nones") { + val allNones = ( + None.asInstanceOf[Option[Int]], + None.asInstanceOf[Option[Long]], + None.asInstanceOf[Option[String]]) + + withParquetRDD(allNones :: Nil) { rdd => + val rows = rdd.collect() + assert(rows.size === 1) + assert(rows.head === Row(Seq.fill(3)(null): _*)) } + } - test(s"$prefix: nulls") { - val allNulls = ( - null.asInstanceOf[java.lang.Boolean], - null.asInstanceOf[Integer], - null.asInstanceOf[java.lang.Long], - null.asInstanceOf[java.lang.Float], - null.asInstanceOf[java.lang.Double]) - - withParquetRDD(allNulls :: Nil) { rdd => - val rows = rdd.collect() - assert(rows.size === 1) - assert(rows.head === Row(Seq.fill(5)(null): _*)) - } + test("compression codec") { + def compressionCodecFor(path: String) = { + val codecs = ParquetTypesConverter + .readMetaData(new Path(path), Some(configuration)) + .getBlocks + .flatMap(_.getColumns) + .map(_.getCodec.name()) + .distinct + + assert(codecs.size === 1) + codecs.head } - test(s"$prefix: nones") { - val allNones = ( - None.asInstanceOf[Option[Int]], - None.asInstanceOf[Option[Long]], - None.asInstanceOf[Option[String]]) + val data = (0 until 10).map(i => (i, i.toString)) - withParquetRDD(allNones :: Nil) { rdd => - val rows = rdd.collect() - assert(rows.size === 1) - assert(rows.head === Row(Seq.fill(3)(null): _*)) + def checkCompressionCodec(codec: CompressionCodecName): Unit = { + withSQLConf(SQLConf.PARQUET_COMPRESSION -> codec.name()) { + withParquetFile(data) { path => + assertResult(conf.parquetCompressionCodec.toUpperCase) { + compressionCodecFor(path) + } + } } } - test(s"$prefix: compression codec") { - def compressionCodecFor(path: String) = { - val codecs = ParquetTypesConverter - .readMetaData(new Path(path), Some(configuration)) - .getBlocks - .flatMap(_.getColumns) - .map(_.getCodec.name()) - .distinct - - assert(codecs.size === 1) - codecs.head - } + // Checks default compression codec + checkCompressionCodec(CompressionCodecName.fromConf(conf.parquetCompressionCodec)) - val data = (0 until 10).map(i => (i, i.toString)) + checkCompressionCodec(CompressionCodecName.UNCOMPRESSED) + checkCompressionCodec(CompressionCodecName.GZIP) + checkCompressionCodec(CompressionCodecName.SNAPPY) + } - def checkCompressionCodec(codec: CompressionCodecName): Unit = { - 
withSQLConf(SQLConf.PARQUET_COMPRESSION -> codec.name()) { - withParquetFile(data) { path => - assertResult(conf.parquetCompressionCodec.toUpperCase) { - compressionCodecFor(path) - } - } - } + test("read raw Parquet file") { + def makeRawParquetFile(path: Path): Unit = { + val schema = MessageTypeParser.parseMessageType( + """ + |message root { + | required boolean _1; + | required int32 _2; + | required int64 _3; + | required float _4; + | required double _5; + |} + """.stripMargin) + + val writeSupport = new TestGroupWriteSupport(schema) + val writer = new ParquetWriter[Group](path, writeSupport) + + (0 until 10).foreach { i => + val record = new SimpleGroup(schema) + record.add(0, i % 2 == 0) + record.add(1, i) + record.add(2, i.toLong) + record.add(3, i.toFloat) + record.add(4, i.toDouble) + writer.write(record) } - // Checks default compression codec - checkCompressionCodec(CompressionCodecName.fromConf(conf.parquetCompressionCodec)) + writer.close() + } - checkCompressionCodec(CompressionCodecName.UNCOMPRESSED) - checkCompressionCodec(CompressionCodecName.GZIP) - checkCompressionCodec(CompressionCodecName.SNAPPY) + withTempDir { dir => + val path = new Path(dir.toURI.toString, "part-r-0.parquet") + makeRawParquetFile(path) + checkAnswer(parquetFile(path.toString), (0 until 10).map { i => + Row(i % 2 == 0, i, i.toLong, i.toFloat, i.toDouble) + }) } + } - test(s"$prefix: read raw Parquet file") { - def makeRawParquetFile(path: Path): Unit = { - val schema = MessageTypeParser.parseMessageType( - """ - |message root { - | required boolean _1; - | required int32 _2; - | required int64 _3; - | required float _4; - | required double _5; - |} - """.stripMargin) - - val writeSupport = new TestGroupWriteSupport(schema) - val writer = new ParquetWriter[Group](path, writeSupport) - - (0 until 10).foreach { i => - val record = new SimpleGroup(schema) - record.add(0, i % 2 == 0) - record.add(1, i) - record.add(2, i.toLong) - record.add(3, i.toFloat) - record.add(4, i.toDouble) - writer.write(record) - } + test("write metadata") { + withTempPath { file => + val path = new Path(file.toURI.toString) + val fs = FileSystem.getLocal(configuration) + val attributes = ScalaReflection.attributesFor[(Int, String)] + ParquetTypesConverter.writeMetaData(attributes, path, configuration) - writer.close() - } + assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) + assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) - withTempDir { dir => - val path = new Path(dir.toURI.toString, "part-r-0.parquet") - makeRawParquetFile(path) - checkAnswer(parquetFile(path.toString), (0 until 10).map { i => - Row(i % 2 == 0, i, i.toLong, i.toFloat, i.toDouble) - }) - } + val metaData = ParquetTypesConverter.readMetaData(path, Some(configuration)) + val actualSchema = metaData.getFileMetaData.getSchema + val expectedSchema = ParquetTypesConverter.convertFromAttributes(attributes) + + actualSchema.checkContains(expectedSchema) + expectedSchema.checkContains(actualSchema) } + } - test(s"$prefix: write metadata") { - withTempPath { file => - val path = new Path(file.toURI.toString) - val fs = FileSystem.getLocal(configuration) - val attributes = ScalaReflection.attributesFor[(Int, String)] - ParquetTypesConverter.writeMetaData(attributes, path, configuration) + test("save - overwrite") { + withParquetFile((1 to 10).map(i => (i, i.toString))) { file => + val newData = (11 to 20).map(i => (i, i.toString)) + newData.toDF().save("org.apache.spark.sql.parquet", SaveMode.Overwrite, 
Map("path" -> file)) + checkAnswer(parquetFile(file), newData.map(Row.fromTuple)) + } + } - assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) - assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) + test("save - ignore") { + val data = (1 to 10).map(i => (i, i.toString)) + withParquetFile(data) { file => + val newData = (11 to 20).map(i => (i, i.toString)) + newData.toDF().save("org.apache.spark.sql.parquet", SaveMode.Ignore, Map("path" -> file)) + checkAnswer(parquetFile(file), data.map(Row.fromTuple)) + } + } - val metaData = ParquetTypesConverter.readMetaData(path, Some(configuration)) - val actualSchema = metaData.getFileMetaData.getSchema - val expectedSchema = ParquetTypesConverter.convertFromAttributes(attributes) + test("save - throw") { + val data = (1 to 10).map(i => (i, i.toString)) + withParquetFile(data) { file => + val newData = (11 to 20).map(i => (i, i.toString)) + val errorMessage = intercept[Throwable] { + newData.toDF().save( + "org.apache.spark.sql.parquet", SaveMode.ErrorIfExists, Map("path" -> file)) + }.getMessage + assert(errorMessage.contains("already exists")) + } + } - actualSchema.checkContains(expectedSchema) - expectedSchema.checkContains(actualSchema) - } + test("save - append") { + val data = (1 to 10).map(i => (i, i.toString)) + withParquetFile(data) { file => + val newData = (11 to 20).map(i => (i, i.toString)) + newData.toDF().save("org.apache.spark.sql.parquet", SaveMode.Append, Map("path" -> file)) + checkAnswer(parquetFile(file), (data ++ newData).map(Row.fromTuple)) } } +} + +class ParquetDataSourceOnIOSuite extends ParquetIOSuiteBase with BeforeAndAfterAll { + val originalConf = sqlContext.conf.parquetUseDataSourceApi + + override protected def beforeAll(): Unit = { + sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") + } + + override protected def afterAll(): Unit = { + sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) + } +} + +class ParquetDataSourceOffIOSuite extends ParquetIOSuiteBase with BeforeAndAfterAll { + val originalConf = sqlContext.conf.parquetUseDataSourceApi - withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") { - run("Parquet data source enabled") + override protected def beforeAll(): Unit = { + sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false") } - withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") { - run("Parquet data source disabled") + override protected def afterAll(): Unit = { + sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala index ae606d11a8f68..3bf0116c8f7e9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala @@ -19,17 +19,24 @@ package org.apache.spark.sql.parquet import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path -import org.scalatest.FunSuite import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.parquet.ParquetRelation2._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.types._ -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.{QueryTest, Row, SQLContext} -class ParquetPartitionDiscoverySuite extends FunSuite 
with ParquetTest { +// The data where the partitioning key exists only in the directory structure. +case class ParquetData(intField: Int, stringField: String) + +// The data that also includes the partitioning key +case class ParquetDataWithKey(intField: Int, pi: Int, stringField: String, ps: String) + +class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest { override val sqlContext: SQLContext = TestSQLContext + import sqlContext._ + val defaultPartitionName = "__NULL__" test("column type inference") { @@ -112,6 +119,17 @@ class ParquetPartitionDiscoverySuite extends FunSuite with ParquetTest { Partition(Row(10, "20"), "hdfs://host:9000/path/a=10/b=20"), Partition(Row(10.5, "hello"), "hdfs://host:9000/path/a=10.5/b=hello")))) + check(Seq( + s"hdfs://host:9000/path/a=10/b=20", + s"hdfs://host:9000/path/a=$defaultPartitionName/b=hello"), + PartitionSpec( + StructType(Seq( + StructField("a", IntegerType), + StructField("b", StringType))), + Seq( + Partition(Row(10, "20"), s"hdfs://host:9000/path/a=10/b=20"), + Partition(Row(null, "hello"), s"hdfs://host:9000/path/a=$defaultPartitionName/b=hello")))) + check(Seq( s"hdfs://host:9000/path/a=10/b=$defaultPartitionName", s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName"), @@ -123,4 +141,182 @@ class ParquetPartitionDiscoverySuite extends FunSuite with ParquetTest { Partition(Row(10, null), s"hdfs://host:9000/path/a=10/b=$defaultPartitionName"), Partition(Row(10.5, null), s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName")))) } + + test("read partitioned table - normal case") { + withTempDir { base => + for { + pi <- Seq(1, 2) + ps <- Seq("foo", "bar") + } { + makeParquetFile( + (1 to 10).map(i => ParquetData(i, i.toString)), + makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps)) + } + + parquetFile(base.getCanonicalPath).registerTempTable("t") + + withTempTable("t") { + checkAnswer( + sql("SELECT * FROM t"), + for { + i <- 1 to 10 + pi <- Seq(1, 2) + ps <- Seq("foo", "bar") + } yield Row(i, i.toString, pi, ps)) + + checkAnswer( + sql("SELECT intField, pi FROM t"), + for { + i <- 1 to 10 + pi <- Seq(1, 2) + _ <- Seq("foo", "bar") + } yield Row(i, pi)) + + checkAnswer( + sql("SELECT * FROM t WHERE pi = 1"), + for { + i <- 1 to 10 + ps <- Seq("foo", "bar") + } yield Row(i, i.toString, 1, ps)) + + checkAnswer( + sql("SELECT * FROM t WHERE ps = 'foo'"), + for { + i <- 1 to 10 + pi <- Seq(1, 2) + } yield Row(i, i.toString, pi, "foo")) + } + } + } + + test("read partitioned table - partition key included in Parquet file") { + withTempDir { base => + for { + pi <- Seq(1, 2) + ps <- Seq("foo", "bar") + } { + makeParquetFile( + (1 to 10).map(i => ParquetDataWithKey(i, pi, i.toString, ps)), + makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps)) + } + + parquetFile(base.getCanonicalPath).registerTempTable("t") + + withTempTable("t") { + checkAnswer( + sql("SELECT * FROM t"), + for { + i <- 1 to 10 + pi <- Seq(1, 2) + ps <- Seq("foo", "bar") + } yield Row(i, pi, i.toString, ps)) + + checkAnswer( + sql("SELECT intField, pi FROM t"), + for { + i <- 1 to 10 + pi <- Seq(1, 2) + _ <- Seq("foo", "bar") + } yield Row(i, pi)) + + checkAnswer( + sql("SELECT * FROM t WHERE pi = 1"), + for { + i <- 1 to 10 + ps <- Seq("foo", "bar") + } yield Row(i, 1, i.toString, ps)) + + checkAnswer( + sql("SELECT * FROM t WHERE ps = 'foo'"), + for { + i <- 1 to 10 + pi <- Seq(1, 2) + } yield Row(i, pi, i.toString, "foo")) + } + } + } + + test("read partitioned table - with nulls") { + withTempDir { base => + for { + // Must 
be `Integer` rather than `Int` here. `null.asInstanceOf[Int]` results in a zero... + pi <- Seq(1, null.asInstanceOf[Integer]) + ps <- Seq("foo", null.asInstanceOf[String]) + } { + makeParquetFile( + (1 to 10).map(i => ParquetData(i, i.toString)), + makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps)) + } + + val parquetRelation = load( + "org.apache.spark.sql.parquet", + Map( + "path" -> base.getCanonicalPath, + ParquetRelation2.DEFAULT_PARTITION_NAME -> defaultPartitionName)) + + parquetRelation.registerTempTable("t") + + withTempTable("t") { + checkAnswer( + sql("SELECT * FROM t"), + for { + i <- 1 to 10 + pi <- Seq(1, null.asInstanceOf[Integer]) + ps <- Seq("foo", null.asInstanceOf[String]) + } yield Row(i, i.toString, pi, ps)) + + checkAnswer( + sql("SELECT * FROM t WHERE pi IS NULL"), + for { + i <- 1 to 10 + ps <- Seq("foo", null.asInstanceOf[String]) + } yield Row(i, i.toString, null, ps)) + + checkAnswer( + sql("SELECT * FROM t WHERE ps IS NULL"), + for { + i <- 1 to 10 + pi <- Seq(1, null.asInstanceOf[Integer]) + } yield Row(i, i.toString, pi, null)) + } + } + } + + test("read partitioned table - with nulls and partition keys are included in Parquet file") { + withTempDir { base => + for { + pi <- Seq(1, 2) + ps <- Seq("foo", null.asInstanceOf[String]) + } { + makeParquetFile( + (1 to 10).map(i => ParquetDataWithKey(i, pi, i.toString, ps)), + makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps)) + } + + val parquetRelation = load( + "org.apache.spark.sql.parquet", + Map( + "path" -> base.getCanonicalPath, + ParquetRelation2.DEFAULT_PARTITION_NAME -> defaultPartitionName)) + + parquetRelation.registerTempTable("t") + + withTempTable("t") { + checkAnswer( + sql("SELECT * FROM t"), + for { + i <- 1 to 10 + pi <- Seq(1, 2) + ps <- Seq("foo", null.asInstanceOf[String]) + } yield Row(i, pi, i.toString, ps)) + + checkAnswer( + sql("SELECT * FROM t WHERE ps IS NULL"), + for { + i <- 1 to 10 + pi <- Seq(1, 2) + } yield Row(i, pi, i.toString, null)) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index cba06835f9a61..d0665450cd766 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -17,103 +17,120 @@ package org.apache.spark.sql.parquet +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.sql.{SQLConf, QueryTest} import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ -import org.apache.spark.sql.{QueryTest, SQLConf} /** * A test suite that tests various Parquet queries. 
*/ -class ParquetQuerySuite extends QueryTest with ParquetTest { +class ParquetQuerySuiteBase extends QueryTest with ParquetTest { val sqlContext = TestSQLContext - def run(prefix: String): Unit = { - test(s"$prefix: simple projection") { - withParquetTable((0 until 10).map(i => (i, i.toString)), "t") { - checkAnswer(sql("SELECT _1 FROM t"), (0 until 10).map(Row.apply(_))) - } + test("simple projection") { + withParquetTable((0 until 10).map(i => (i, i.toString)), "t") { + checkAnswer(sql("SELECT _1 FROM t"), (0 until 10).map(Row.apply(_))) } + } - test(s"$prefix: appending") { - val data = (0 until 10).map(i => (i, i.toString)) - withParquetTable(data, "t") { - sql("INSERT INTO TABLE t SELECT * FROM t") - checkAnswer(table("t"), (data ++ data).map(Row.fromTuple)) - } + test("appending") { + val data = (0 until 10).map(i => (i, i.toString)) + withParquetTable(data, "t") { + sql("INSERT INTO TABLE t SELECT * FROM t") + checkAnswer(table("t"), (data ++ data).map(Row.fromTuple)) } + } - // This test case will trigger the NPE mentioned in - // https://issues.apache.org/jira/browse/PARQUET-151. - ignore(s"$prefix: overwriting") { - val data = (0 until 10).map(i => (i, i.toString)) - withParquetTable(data, "t") { - sql("INSERT OVERWRITE TABLE t SELECT * FROM t") - checkAnswer(table("t"), data.map(Row.fromTuple)) - } + // This test case will trigger the NPE mentioned in + // https://issues.apache.org/jira/browse/PARQUET-151. + // Update: This also triggers SPARK-5746, should re enable it when we get both fixed. + ignore("overwriting") { + val data = (0 until 10).map(i => (i, i.toString)) + withParquetTable(data, "t") { + sql("INSERT OVERWRITE TABLE t SELECT * FROM t") + checkAnswer(table("t"), data.map(Row.fromTuple)) } + } - test(s"$prefix: self-join") { - // 4 rows, cells of column 1 of row 2 and row 4 are null - val data = (1 to 4).map { i => - val maybeInt = if (i % 2 == 0) None else Some(i) - (maybeInt, i.toString) - } - - withParquetTable(data, "t") { - val selfJoin = sql("SELECT * FROM t x JOIN t y WHERE x._1 = y._1") - val queryOutput = selfJoin.queryExecution.analyzed.output + test("self-join") { + // 4 rows, cells of column 1 of row 2 and row 4 are null + val data = (1 to 4).map { i => + val maybeInt = if (i % 2 == 0) None else Some(i) + (maybeInt, i.toString) + } - assertResult(4, s"Field count mismatches")(queryOutput.size) - assertResult(2, s"Duplicated expression ID in query plan:\n $selfJoin") { - queryOutput.filter(_.name == "_1").map(_.exprId).size - } + withParquetTable(data, "t") { + val selfJoin = sql("SELECT * FROM t x JOIN t y WHERE x._1 = y._1") + val queryOutput = selfJoin.queryExecution.analyzed.output - checkAnswer(selfJoin, List(Row(1, "1", 1, "1"), Row(3, "3", 3, "3"))) + assertResult(4, "Field count mismatche")(queryOutput.size) + assertResult(2, "Duplicated expression ID in query plan:\n $selfJoin") { + queryOutput.filter(_.name == "_1").map(_.exprId).size } + + checkAnswer(selfJoin, List(Row(1, "1", 1, "1"), Row(3, "3", 3, "3"))) } + } - test(s"$prefix: nested data - struct with array field") { - val data = (1 to 10).map(i => Tuple1((i, Seq(s"val_$i")))) - withParquetTable(data, "t") { - checkAnswer(sql("SELECT _1._2[0] FROM t"), data.map { - case Tuple1((_, Seq(string))) => Row(string) - }) - } + test("nested data - struct with array field") { + val data = (1 to 10).map(i => Tuple1((i, Seq("val_$i")))) + withParquetTable(data, "t") { + checkAnswer(sql("SELECT _1._2[0] FROM t"), data.map { + case Tuple1((_, Seq(string))) => Row(string) + }) } + } - test(s"$prefix: 
nested data - array of struct") { - val data = (1 to 10).map(i => Tuple1(Seq(i -> s"val_$i"))) - withParquetTable(data, "t") { - checkAnswer(sql("SELECT _1[0]._2 FROM t"), data.map { - case Tuple1(Seq((_, string))) => Row(string) - }) - } + test("nested data - array of struct") { + val data = (1 to 10).map(i => Tuple1(Seq(i -> "val_$i"))) + withParquetTable(data, "t") { + checkAnswer(sql("SELECT _1[0]._2 FROM t"), data.map { + case Tuple1(Seq((_, string))) => Row(string) + }) } + } - test(s"$prefix: SPARK-1913 regression: columns only referenced by pushed down filters should remain") { - withParquetTable((1 to 10).map(Tuple1.apply), "t") { - checkAnswer(sql(s"SELECT _1 FROM t WHERE _1 < 10"), (1 to 9).map(Row.apply(_))) - } + test("SPARK-1913 regression: columns only referenced by pushed down filters should remain") { + withParquetTable((1 to 10).map(Tuple1.apply), "t") { + checkAnswer(sql("SELECT _1 FROM t WHERE _1 < 10"), (1 to 9).map(Row.apply(_))) } + } - test(s"$prefix: SPARK-5309 strings stored using dictionary compression in parquet") { - withParquetTable((0 until 1000).map(i => ("same", "run_" + i /100, 1)), "t") { + test("SPARK-5309 strings stored using dictionary compression in parquet") { + withParquetTable((0 until 1000).map(i => ("same", "run_" + i /100, 1)), "t") { - checkAnswer(sql(s"SELECT _1, _2, SUM(_3) FROM t GROUP BY _1, _2"), - (0 until 10).map(i => Row("same", "run_" + i, 100))) + checkAnswer(sql("SELECT _1, _2, SUM(_3) FROM t GROUP BY _1, _2"), + (0 until 10).map(i => Row("same", "run_" + i, 100))) - checkAnswer(sql(s"SELECT _1, _2, SUM(_3) FROM t WHERE _2 = 'run_5' GROUP BY _1, _2"), - List(Row("same", "run_5", 100))) - } + checkAnswer(sql("SELECT _1, _2, SUM(_3) FROM t WHERE _2 = 'run_5' GROUP BY _1, _2"), + List(Row("same", "run_5", 100))) } } +} + +class ParquetDataSourceOnQuerySuite extends ParquetQuerySuiteBase with BeforeAndAfterAll { + val originalConf = sqlContext.conf.parquetUseDataSourceApi + + override protected def beforeAll(): Unit = { + sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") + } + + override protected def afterAll(): Unit = { + sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) + } +} + +class ParquetDataSourceOffQuerySuite extends ParquetQuerySuiteBase with BeforeAndAfterAll { + val originalConf = sqlContext.conf.parquetUseDataSourceApi - withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") { - run("Parquet data source enabled") + override protected def beforeAll(): Unit = { + sqlContext.conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false") } - withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") { - run("Parquet data source disabled") + override protected def afterAll(): Unit = { + sqlContext.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index ddc7b181d4d46..87b380f950979 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -22,26 +22,24 @@ import java.sql.Timestamp import scala.collection.JavaConversions._ import scala.language.implicitConversions -import scala.reflect.runtime.universe.TypeTag import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.metadata.Table -import 
org.apache.hadoop.hive.ql.processors._ import org.apache.hadoop.hive.ql.parse.VariableSubstitution +import org.apache.hadoop.hive.ql.processors._ import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.serde2.io.{DateWritable, TimestampWritable} import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateSubQueries, OverrideCatalog, OverrideFunctionRegistry} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, SetCommand, QueryExecutionException} -import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DescribeHiveTableCommand} -import org.apache.spark.sql.sources.{CreateTableUsing, DataSourceStrategy} +import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, QueryExecutionException, SetCommand} +import org.apache.spark.sql.hive.execution.{DescribeHiveTableCommand, HiveNativeCommand} +import org.apache.spark.sql.sources.DataSourceStrategy import org.apache.spark.sql.types._ /** @@ -244,6 +242,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { override protected[sql] lazy val analyzer = new Analyzer(catalog, functionRegistry, caseSensitive = false) { override val extendedRules = + catalog.ParquetConversions :: catalog.CreateTables :: catalog.PreInsertionCasts :: ExtractPythonUdfs :: diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index eb1ee54247bea..6d794d0e11391 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -20,25 +20,25 @@ package org.apache.spark.sql.hive import java.io.IOException import java.util.{List => JList} -import com.google.common.cache.{LoadingCache, CacheLoader, CacheBuilder} - -import org.apache.hadoop.util.ReflectionUtils -import org.apache.hadoop.hive.metastore.{Warehouse, TableType} -import org.apache.hadoop.hive.metastore.api.{Table => TTable, Partition => TPartition, FieldSchema} +import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} +import org.apache.hadoop.hive.metastore.api.{FieldSchema, Partition => TPartition, Table => TTable} +import org.apache.hadoop.hive.metastore.{TableType, Warehouse} import org.apache.hadoop.hive.ql.metadata._ import org.apache.hadoop.hive.ql.plan.CreateTableDesc import org.apache.hadoop.hive.serde.serdeConstants -import org.apache.hadoop.hive.serde2.{Deserializer, SerDeException} import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe +import org.apache.hadoop.hive.serde2.{Deserializer, SerDeException} +import org.apache.hadoop.util.ReflectionUtils import org.apache.spark.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.analysis.{Catalog, OverrideCatalog} import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.parquet.ParquetRelation2 +import org.apache.spark.sql.parquet.{ParquetRelation2, Partition => ParquetPartition, PartitionSpec} import org.apache.spark.sql.sources.{DDLParser, 
LogicalRelation, ResolvedDataSource} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -101,16 +101,10 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val caseSensitive: Boolean = false - /** * - * Creates a data source table (a table created with USING clause) in Hive's metastore. - * Returns true when the table has been created. Otherwise, false. - * @param tableName - * @param userSpecifiedSchema - * @param provider - * @param options - * @param isExternal - * @return - */ + /** + * Creates a data source table (a table created with USING clause) in Hive's metastore. + * Returns true when the table has been created. Otherwise, false. + */ def createDataSourceTable( tableName: String, userSpecifiedSchema: Option[StructType], @@ -141,7 +135,7 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with } def hiveDefaultTableFilePath(tableName: String): String = { - val currentDatabase = client.getDatabase(hive.sessionState.getCurrentDatabase()) + val currentDatabase = client.getDatabase(hive.sessionState.getCurrentDatabase) hiveWarehouse.getTablePath(currentDatabase, tableName).toString } @@ -176,25 +170,41 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with Nil } - val relation = MetastoreRelation( - databaseName, tblName, alias)( - table.getTTable, partitions.map(part => part.getTPartition))(hive) - - if (hive.convertMetastoreParquet && - hive.conf.parquetUseDataSourceApi && - relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet")) { - val metastoreSchema = StructType.fromAttributes(relation.output) - val paths = if (relation.hiveQlTable.isPartitioned) { - relation.hiveQlPartitions.map(p => p.getLocation) - } else { - Seq(relation.hiveQlTable.getDataLocation.toString) - } + MetastoreRelation(databaseName, tblName, alias)( + table.getTTable, partitions.map(part => part.getTPartition))(hive) + } + } - LogicalRelation(ParquetRelation2( - paths, Map(ParquetRelation2.METASTORE_SCHEMA -> metastoreSchema.json))(hive)) - } else { - relation + private def convertToParquetRelation(metastoreRelation: MetastoreRelation): LogicalRelation = { + val metastoreSchema = StructType.fromAttributes(metastoreRelation.output) + + // NOTE: Instead of passing Metastore schema directly to `ParquetRelation2`, we have to + // serialize the Metastore schema to JSON and pass it as a data source option because of the + // evil case insensitivity issue, which is reconciled within `ParquetRelation2`. 
+ if (metastoreRelation.hiveQlTable.isPartitioned) { + val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys) + val partitionColumnDataTypes = partitionSchema.map(_.dataType) + val partitions = metastoreRelation.hiveQlPartitions.map { p => + val location = p.getLocation + val values = Row.fromSeq(p.getValues.zip(partitionColumnDataTypes).map { + case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) + }) + ParquetPartition(values, location) } + val partitionSpec = PartitionSpec(partitionSchema, partitions) + val paths = partitions.map(_.path) + LogicalRelation( + ParquetRelation2( + paths, + Map(ParquetRelation2.METASTORE_SCHEMA -> metastoreSchema.json), + None, + Some(partitionSpec))(hive)) + } else { + val paths = Seq(metastoreRelation.hiveQlTable.getDataLocation.toString) + LogicalRelation( + ParquetRelation2( + paths, + Map(ParquetRelation2.METASTORE_SCHEMA -> metastoreSchema.json))(hive)) } } @@ -261,9 +271,9 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with logInfo(s"Default to LazySimpleSerDe for table $dbName.$tblName") tbl.setSerializationLib(classOf[LazySimpleSerDe].getName()) - import org.apache.hadoop.mapred.TextInputFormat import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat import org.apache.hadoop.io.Text + import org.apache.hadoop.mapred.TextInputFormat tbl.setInputFormatClass(classOf[TextInputFormat]) tbl.setOutputFormatClass(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]]) @@ -385,13 +395,56 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with } } + /** + * When scanning or writing to non-partitioned Metastore Parquet tables, convert them to Parquet + * data source relations for better performance. + * + * This rule can be considered as [[HiveStrategies.ParquetConversion]] done right. + */ + object ParquetConversions extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = { + // Collects all `MetastoreRelation`s which should be replaced + val toBeReplaced = plan.collect { + // Write path + case InsertIntoTable(relation: MetastoreRelation, _, _, _) + // Inserting into partitioned table is not supported in Parquet data source (yet). + if !relation.hiveQlTable.isPartitioned && + hive.convertMetastoreParquet && + hive.conf.parquetUseDataSourceApi && + relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => + relation + + // Read path + case p @ PhysicalOperation(_, _, relation: MetastoreRelation) + if hive.convertMetastoreParquet && + hive.conf.parquetUseDataSourceApi && + relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => + relation + } + + // Replaces all `MetastoreRelation`s with corresponding `ParquetRelation2`s, and fixes + // attribute IDs referenced in other nodes. + toBeReplaced.distinct.foldLeft(plan) { (lastPlan, relation) => + val parquetRelation = convertToParquetRelation(relation) + val attributedRewrites = AttributeMap(relation.output.zip(parquetRelation.output)) + + lastPlan.transformUp { + case r: MetastoreRelation if r == relation => parquetRelation + case other => other.transformExpressions { + case a: Attribute if a.resolved => attributedRewrites.getOrElse(a, a) + } + } + } + } + } + /** * Creates any tables required for query execution. * For example, because of a CREATE TABLE X AS statement. 
*/ object CreateTables extends Rule[LogicalPlan] { import org.apache.hadoop.hive.ql.Context - import org.apache.hadoop.hive.ql.parse.{QB, ASTNode, SemanticAnalyzer} + import org.apache.hadoop.hive.ql.parse.{ASTNode, QB, SemanticAnalyzer} def apply(plan: LogicalPlan): LogicalPlan = plan transform { // Wait until children are resolved. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index cb138be90e2e1..965d159656d80 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -139,15 +139,19 @@ private[hive] trait HiveStrategies { val partitionLocations = partitions.map(_.getLocation) - hiveContext - .parquetFile(partitionLocations.head, partitionLocations.tail: _*) - .addPartitioningAttributes(relation.partitionKeys) - .lowerCase - .where(unresolvedOtherPredicates) - .select(unresolvedProjection: _*) - .queryExecution - .executedPlan - .fakeOutput(projectList.map(_.toAttribute)) :: Nil + if (partitionLocations.isEmpty) { + PhysicalRDD(plan.output, sparkContext.emptyRDD[Row]) :: Nil + } else { + hiveContext + .parquetFile(partitionLocations.head, partitionLocations.tail: _*) + .addPartitioningAttributes(relation.partitionKeys) + .lowerCase + .where(unresolvedOtherPredicates) + .select(unresolvedProjection: _*) + .queryExecution + .executedPlan + .fakeOutput(projectList.map(_.toAttribute)) :: Nil + } } else { hiveContext .parquetFile(relation.hiveQlTable.getDataLocation.toString) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index e246cbb6d77f0..2acf1a7767c19 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -40,7 +40,7 @@ case class ParquetDataWithKey(p: Int, intField: Int, stringField: String) * A suite to test the automatic conversion of metastore tables with parquet data to use the * built in parquet support. 
*/ -class ParquetMetastoreSuite extends ParquetPartitioningTest { +class ParquetMetastoreSuiteBase extends ParquetPartitioningTest { override def beforeAll(): Unit = { super.beforeAll() @@ -97,6 +97,9 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { } override def afterAll(): Unit = { + sql("DROP TABLE partitioned_parquet") + sql("DROP TABLE partitioned_parquet_with_key") + sql("DROP TABLE normal_parquet") setConf("spark.sql.hive.convertMetastoreParquet", "false") } @@ -113,10 +116,38 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { } } +class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase { + val originalConf = conf.parquetUseDataSourceApi + + override def beforeAll(): Unit = { + super.beforeAll() + conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") + } + + override def afterAll(): Unit = { + super.afterAll() + setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) + } +} + +class ParquetDataSourceOffMetastoreSuite extends ParquetMetastoreSuiteBase { + val originalConf = conf.parquetUseDataSourceApi + + override def beforeAll(): Unit = { + super.beforeAll() + conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false") + } + + override def afterAll(): Unit = { + super.afterAll() + setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) + } +} + /** * A suite of tests for the Parquet support through the data sources API. */ -class ParquetSourceSuite extends ParquetPartitioningTest { +class ParquetSourceSuiteBase extends ParquetPartitioningTest { override def beforeAll(): Unit = { super.beforeAll() @@ -146,6 +177,34 @@ class ParquetSourceSuite extends ParquetPartitioningTest { } } +class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase { + val originalConf = conf.parquetUseDataSourceApi + + override def beforeAll(): Unit = { + super.beforeAll() + conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") + } + + override def afterAll(): Unit = { + super.afterAll() + setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) + } +} + +class ParquetDataSourceOffSourceSuite extends ParquetSourceSuiteBase { + val originalConf = conf.parquetUseDataSourceApi + + override def beforeAll(): Unit = { + super.beforeAll() + conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false") + } + + override def afterAll(): Unit = { + super.afterAll() + setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) + } +} + /** * A collection of tests for parquet data with various forms of partitioning. 
*/ @@ -191,107 +250,99 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll } } - def run(prefix: String): Unit = { - Seq("partitioned_parquet", "partitioned_parquet_with_key").foreach { table => - test(s"$prefix: ordering of the partitioning columns $table") { - checkAnswer( - sql(s"SELECT p, stringField FROM $table WHERE p = 1"), - Seq.fill(10)(Row(1, "part-1")) - ) - - checkAnswer( - sql(s"SELECT stringField, p FROM $table WHERE p = 1"), - Seq.fill(10)(Row("part-1", 1)) - ) - } - - test(s"$prefix: project the partitioning column $table") { - checkAnswer( - sql(s"SELECT p, count(*) FROM $table group by p"), - Row(1, 10) :: - Row(2, 10) :: - Row(3, 10) :: - Row(4, 10) :: - Row(5, 10) :: - Row(6, 10) :: - Row(7, 10) :: - Row(8, 10) :: - Row(9, 10) :: - Row(10, 10) :: Nil - ) - } - - test(s"$prefix: project partitioning and non-partitioning columns $table") { - checkAnswer( - sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"), - Row("part-1", 1, 10) :: - Row("part-2", 2, 10) :: - Row("part-3", 3, 10) :: - Row("part-4", 4, 10) :: - Row("part-5", 5, 10) :: - Row("part-6", 6, 10) :: - Row("part-7", 7, 10) :: - Row("part-8", 8, 10) :: - Row("part-9", 9, 10) :: - Row("part-10", 10, 10) :: Nil - ) - } - - test(s"$prefix: simple count $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table"), - Row(100)) - } - - test(s"$prefix: pruned count $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"), - Row(10)) - } - - test(s"$prefix: non-existent partition $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"), - Row(0)) - } - - test(s"$prefix: multi-partition pruned count $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"), - Row(30)) - } - - test(s"$prefix: non-partition predicates $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE intField IN (1,2,3)"), - Row(30)) - } - - test(s"$prefix: sum $table") { - checkAnswer( - sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"), - Row(1 + 2 + 3)) - } - - test(s"$prefix: hive udfs $table") { - checkAnswer( - sql(s"SELECT concat(stringField, stringField) FROM $table"), - sql(s"SELECT stringField FROM $table").map { - case Row(s: String) => Row(s + s) - }.collect().toSeq) - } + Seq("partitioned_parquet", "partitioned_parquet_with_key").foreach { table => + test(s"ordering of the partitioning columns $table") { + checkAnswer( + sql(s"SELECT p, stringField FROM $table WHERE p = 1"), + Seq.fill(10)(Row(1, "part-1")) + ) + + checkAnswer( + sql(s"SELECT stringField, p FROM $table WHERE p = 1"), + Seq.fill(10)(Row("part-1", 1)) + ) + } + + test(s"project the partitioning column $table") { + checkAnswer( + sql(s"SELECT p, count(*) FROM $table group by p"), + Row(1, 10) :: + Row(2, 10) :: + Row(3, 10) :: + Row(4, 10) :: + Row(5, 10) :: + Row(6, 10) :: + Row(7, 10) :: + Row(8, 10) :: + Row(9, 10) :: + Row(10, 10) :: Nil + ) + } + + test(s"project partitioning and non-partitioning columns $table") { + checkAnswer( + sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"), + Row("part-1", 1, 10) :: + Row("part-2", 2, 10) :: + Row("part-3", 3, 10) :: + Row("part-4", 4, 10) :: + Row("part-5", 5, 10) :: + Row("part-6", 6, 10) :: + Row("part-7", 7, 10) :: + Row("part-8", 8, 10) :: + Row("part-9", 9, 10) :: + Row("part-10", 10, 10) :: Nil + ) + } + + test(s"simple count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table"), + Row(100)) } - test(s"$prefix: 
$prefix: non-part select(*)") { + test(s"pruned count $table") { checkAnswer( - sql("SELECT COUNT(*) FROM normal_parquet"), + sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"), Row(10)) } - } - setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false") - run("Parquet data source enabled") + test(s"non-existent partition $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"), + Row(0)) + } + + test(s"multi-partition pruned count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"), + Row(30)) + } + + test(s"non-partition predicates $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE intField IN (1,2,3)"), + Row(30)) + } - setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") - run("Parquet data source disabled") + test(s"sum $table") { + checkAnswer( + sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"), + Row(1 + 2 + 3)) + } + + test(s"hive udfs $table") { + checkAnswer( + sql(s"SELECT concat(stringField, stringField) FROM $table"), + sql(s"SELECT stringField FROM $table").map { + case Row(s: String) => Row(s + s) + }.collect().toSeq) + } + } + + test("non-part select(*)") { + checkAnswer( + sql("SELECT COUNT(*) FROM normal_parquet"), + Row(10)) + } } From 1115e8e739ec4d60604cd79bd452770f041510d4 Mon Sep 17 00:00:00 2001 From: Xutingjun <1039320815@qq.com> Date: Mon, 16 Feb 2015 14:54:23 +0000 Subject: [PATCH 181/357] [SPARK-5831][Streaming]When checkpoint file size is bigger than 10, then delete the old ones Author: Xutingjun <1039320815@qq.com> Closes #4621 from XuTingjun/checkpointFile and squashes the following commits: b5f2816 [Xutingjun] fix bug --- .../src/main/scala/org/apache/spark/streaming/Checkpoint.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index b780282bdac37..f88a8a0151550 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -152,7 +152,7 @@ class CheckpointWriter( // Delete old checkpoint files val allCheckpointFiles = Checkpoint.getCheckpointFiles(checkpointDir, fs) - if (allCheckpointFiles.size > 4) { + if (allCheckpointFiles.size > 10) { allCheckpointFiles.take(allCheckpointFiles.size - 10).foreach(file => { logInfo("Deleting " + file) fs.delete(file, true) From a3afa4a1bff88c4d8a5228fcf1e0cfc132541a22 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 16 Feb 2015 17:04:30 +0000 Subject: [PATCH 182/357] SPARK-5815 [MLLIB] Part 2. 
Deprecate SVDPlusPlus APIs that expose DoubleMatrix from JBLAS Now, deprecated runSVDPlusPlus and update run, for 1.4.0 / master only Author: Sean Owen Closes #4625 from srowen/SPARK-5815.2 and squashes the following commits: 6fd2ca5 [Sean Owen] Now, deprecated runSVDPlusPlus and update run, for 1.4.0 / master only --- .../apache/spark/graphx/lib/SVDPlusPlus.scala | 42 +++++++------------ 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala index fc84cfbe64184..3e4157a63fd1c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala @@ -17,8 +17,6 @@ package org.apache.spark.graphx.lib -import org.apache.spark.annotation.Experimental - import scala.util.Random import org.jblas.DoubleMatrix import org.apache.spark.rdd._ @@ -40,8 +38,17 @@ object SVDPlusPlus { extends Serializable /** - * :: Experimental :: - * + * This method is now replaced by the updated version of `run()` and returns exactly + * the same result. + */ + @deprecated("Call run()", "1.4.0") + def runSVDPlusPlus(edges: RDD[Edge[Double]], conf: Conf) + : (Graph[(Array[Double], Array[Double], Double, Double), Double], Double) = + { + run(edges, conf) + } + + /** * Implement SVD++ based on "Factorization Meets the Neighborhood: * a Multifaceted Collaborative Filtering Model", * available at [[http://public.research.att.com/~volinsky/netflix/kdd08koren.pdf]]. @@ -49,35 +56,14 @@ object SVDPlusPlus { * The prediction rule is rui = u + bu + bi + qi*(pu + |N(u)|^(-0.5)*sum(y)), * see the details on page 6. * - * This method temporarily replaces `run()`, and replaces `DoubleMatrix` in `run()`'s return - * value with `Array[Double]`. In 1.4.0, this method will be deprecated, but will be copied - * to replace `run()`, which will then be undeprecated. - * * @param edges edges for constructing the graph * * @param conf SVDPlusPlus parameters * * @return a graph with vertex attributes containing the trained model */ - @Experimental - def runSVDPlusPlus(edges: RDD[Edge[Double]], conf: Conf) - : (Graph[(Array[Double], Array[Double], Double, Double), Double], Double) = - { - val (graph, u) = run(edges, conf) - // Convert DoubleMatrix to Array[Double]: - val newVertices = graph.vertices.mapValues(v => (v._1.toArray, v._2.toArray, v._3, v._4)) - (Graph(newVertices, graph.edges), u) - } - - /** - * This method is deprecated in favor of `runSVDPlusPlus()`, which replaces `DoubleMatrix` - * with `Array[Double]` in its return value. This method is deprecated. It will effectively - * be removed in 1.4.0 when `runSVDPlusPlus()` is copied to replace `run()`, and hence the - * return type of this method changes. 
- */ - @deprecated("Call runSVDPlusPlus", "1.3.0") def run(edges: RDD[Edge[Double]], conf: Conf) - : (Graph[(DoubleMatrix, DoubleMatrix, Double, Double), Double], Double) = + : (Graph[(Array[Double], Array[Double], Double, Double), Double], Double) = { // Generate default vertex attribute def defaultF(rank: Int): (DoubleMatrix, DoubleMatrix, Double, Double) = { @@ -194,7 +180,9 @@ object SVDPlusPlus { g.unpersist() g = gJoinT3 - (g, u) + // Convert DoubleMatrix to Array[Double]: + val newVertices = g.vertices.mapValues(v => (v._1.toArray, v._2.toArray, v._3, v._4)) + (Graph(newVertices, g.edges), u) } /** From 5c78be7a515fc2fc92cda0517318e7b5d85762f4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 16 Feb 2015 10:06:11 -0800 Subject: [PATCH 183/357] [SPARK-5799][SQL] Compute aggregation function on specified numeric columns Compute aggregation function on specified numeric columns. For example: val df = Seq(("a", 1, 0, "b"), ("b", 2, 4, "c"), ("a", 2, 3, "d")).toDataFrame("key", "value1", "value2", "rest") df.groupBy("key").min("value2") Author: Liang-Chi Hsieh Closes #4592 from viirya/specific_cols_agg and squashes the following commits: 9446896 [Liang-Chi Hsieh] For comments. 314c4cd [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into specific_cols_agg 353fad7 [Liang-Chi Hsieh] For python unit tests. 54ed0c4 [Liang-Chi Hsieh] Address comments. b079e6b [Liang-Chi Hsieh] Remove duplicate codes. 55100fb [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into specific_cols_agg 880c2ac [Liang-Chi Hsieh] Fix Python style checks. 4c63a01 [Liang-Chi Hsieh] Fix pyspark. b1a24fc [Liang-Chi Hsieh] Address comments. 2592f29 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into specific_cols_agg 27069c3 [Liang-Chi Hsieh] Combine functions and add varargs annotation. 371a3f7 [Liang-Chi Hsieh] Compute aggregation function on specified numeric columns. --- python/pyspark/sql/dataframe.py | 74 +++++++++++++++---- python/pyspark/sql/functions.py | 2 + .../org/apache/spark/sql/DataFrameImpl.scala | 4 +- .../org/apache/spark/sql/GroupedData.scala | 57 +++++++++++--- .../org/apache/spark/sql/DataFrameSuite.scala | 12 +++ 5 files changed, 123 insertions(+), 26 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 1438fe5285cc5..28a59e73a3410 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -664,6 +664,18 @@ def _api(self): return _api +def df_varargs_api(f): + def _api(self, *args): + jargs = ListConverter().convert(args, + self.sql_ctx._sc._gateway._gateway_client) + name = f.__name__ + jdf = getattr(self._jdf, name)(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jargs)) + return DataFrame(jdf, self.sql_ctx) + _api.__name__ = f.__name__ + _api.__doc__ = f.__doc__ + return _api + + class GroupedData(object): """ @@ -714,30 +726,60 @@ def count(self): [Row(age=2, count=1), Row(age=5, count=1)] """ - @dfapi - def mean(self): + @df_varargs_api + def mean(self, *cols): """Compute the average value for each numeric columns - for each group. This is an alias for `avg`.""" + for each group. This is an alias for `avg`. - @dfapi - def avg(self): + >>> df.groupBy().mean('age').collect() + [Row(AVG(age#0)=3.5)] + >>> df3.groupBy().mean('age', 'height').collect() + [Row(AVG(age#4)=3.5, AVG(height#5)=82.5)] + """ + + @df_varargs_api + def avg(self, *cols): """Compute the average value for each numeric columns - for each group.""" + for each group. 
- @dfapi - def max(self): + >>> df.groupBy().avg('age').collect() + [Row(AVG(age#0)=3.5)] + >>> df3.groupBy().avg('age', 'height').collect() + [Row(AVG(age#4)=3.5, AVG(height#5)=82.5)] + """ + + @df_varargs_api + def max(self, *cols): """Compute the max value for each numeric columns for - each group. """ + each group. - @dfapi - def min(self): + >>> df.groupBy().max('age').collect() + [Row(MAX(age#0)=5)] + >>> df3.groupBy().max('age', 'height').collect() + [Row(MAX(age#4)=5, MAX(height#5)=85)] + """ + + @df_varargs_api + def min(self, *cols): """Compute the min value for each numeric column for - each group.""" + each group. - @dfapi - def sum(self): + >>> df.groupBy().min('age').collect() + [Row(MIN(age#0)=2)] + >>> df3.groupBy().min('age', 'height').collect() + [Row(MIN(age#4)=2, MIN(height#5)=80)] + """ + + @df_varargs_api + def sum(self, *cols): """Compute the sum for each numeric columns for each - group.""" + group. + + >>> df.groupBy().sum('age').collect() + [Row(SUM(age#0)=7)] + >>> df3.groupBy().sum('age', 'height').collect() + [Row(SUM(age#4)=7, SUM(height#5)=165)] + """ def _create_column_from_literal(literal): @@ -945,6 +987,8 @@ def _test(): globs['sqlCtx'] = SQLContext(sc) globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() + globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80), + Row(name='Bob', age=5, height=85)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 39aa550eeb5ad..d0e090607ff4f 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -158,6 +158,8 @@ def _test(): globs['sqlCtx'] = SQLContext(sc) globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() + globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80), + Row(name='Bob', age=5, height=85)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 7b7efbe3477b6..9eb0c131405d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -88,12 +88,12 @@ private[sql] class DataFrameImpl protected[sql]( } } - protected[sql] def numericColumns: Seq[Expression] = { + protected[sql] def numericColumns(): Seq[Expression] = { schema.fields.filter(_.dataType.isInstanceOf[NumericType]).map { n => queryExecution.analyzed.resolve(n.name, sqlContext.analyzer.resolver).get } } - + override def toDF(colNames: String*): DataFrame = { require(schema.size == colNames.size, "The number of columns doesn't match.\n" + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala index 0868013fe7c96..a5a677b68863f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala @@ -23,6 +23,8 @@ import scala.collection.JavaConversions._ import 
org.apache.spark.sql.catalyst.analysis.Star import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Aggregate +import org.apache.spark.sql.types.NumericType + /** @@ -39,13 +41,30 @@ class GroupedData protected[sql](df: DataFrameImpl, groupingExprs: Seq[Expressio df.sqlContext, Aggregate(groupingExprs, namedGroupingExprs ++ aggExprs, df.logicalPlan)) } - private[this] def aggregateNumericColumns(f: Expression => Expression): Seq[NamedExpression] = { - df.numericColumns.map { c => + private[this] def aggregateNumericColumns(colNames: String*)(f: Expression => Expression) + : Seq[NamedExpression] = { + + val columnExprs = if (colNames.isEmpty) { + // No columns specified. Use all numeric columns. + df.numericColumns + } else { + // Make sure all specified columns are numeric + colNames.map { colName => + val namedExpr = df.resolve(colName) + if (!namedExpr.dataType.isInstanceOf[NumericType]) { + throw new AnalysisException( + s""""$colName" is not a numeric column. """ + + "Aggregation function can only be performed on a numeric column.") + } + namedExpr + } + } + columnExprs.map { c => val a = f(c) Alias(a, a.toString)() } } - + private[this] def strToExpr(expr: String): (Expression => Expression) = { expr.toLowerCase match { case "avg" | "average" | "mean" => Average @@ -152,30 +171,50 @@ class GroupedData protected[sql](df: DataFrameImpl, groupingExprs: Seq[Expressio /** * Compute the average value for each numeric columns for each group. This is an alias for `avg`. * The resulting [[DataFrame]] will also contain the grouping columns. + * When specified columns are given, only compute the average values for them. */ - def mean(): DataFrame = aggregateNumericColumns(Average) - + @scala.annotation.varargs + def mean(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames:_*)(Average) + } + /** * Compute the max value for each numeric columns for each group. * The resulting [[DataFrame]] will also contain the grouping columns. + * When specified columns are given, only compute the max values for them. */ - def max(): DataFrame = aggregateNumericColumns(Max) + @scala.annotation.varargs + def max(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames:_*)(Max) + } /** * Compute the mean value for each numeric columns for each group. * The resulting [[DataFrame]] will also contain the grouping columns. + * When specified columns are given, only compute the mean values for them. */ - def avg(): DataFrame = aggregateNumericColumns(Average) + @scala.annotation.varargs + def avg(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames:_*)(Average) + } /** * Compute the min value for each numeric column for each group. * The resulting [[DataFrame]] will also contain the grouping columns. + * When specified columns are given, only compute the min values for them. */ - def min(): DataFrame = aggregateNumericColumns(Min) + @scala.annotation.varargs + def min(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames:_*)(Min) + } /** * Compute the sum for each numeric columns for each group. * The resulting [[DataFrame]] will also contain the grouping columns. + * When specified columns are given, only compute the sum for them. 
*/ - def sum(): DataFrame = aggregateNumericColumns(Sum) + @scala.annotation.varargs + def sum(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames:_*)(Sum) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index f0cd43632ec3f..524571d9cc636 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -162,6 +162,18 @@ class DataFrameSuite extends QueryTest { testData2.groupBy("a").agg(Map("b" -> "sum")), Row(1, 3) :: Row(2, 3) :: Row(3, 3) :: Nil ) + + val df1 = Seq(("a", 1, 0, "b"), ("b", 2, 4, "c"), ("a", 2, 3, "d")) + .toDF("key", "value1", "value2", "rest") + + checkAnswer( + df1.groupBy("key").min(), + df1.groupBy("key").min("value1", "value2").collect() + ) + checkAnswer( + df1.groupBy("key").min("value2"), + Seq(Row("a", 0), Row("b", 4)) + ) } test("agg without groups") { From 9baac56ccd57d3890a9b6439d4e13bbe9381822b Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Mon, 16 Feb 2015 10:09:55 -0800 Subject: [PATCH 184/357] Minor fixes for commit https://github.com/apache/spark/pull/4592. --- .../main/scala/org/apache/spark/sql/DataFrameImpl.scala | 6 +++--- .../main/scala/org/apache/spark/sql/GroupedData.scala | 9 ++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 9eb0c131405d8..500e3c90fdbc1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -83,17 +83,17 @@ private[sql] class DataFrameImpl protected[sql]( protected[sql] def resolve(colName: String): NamedExpression = { queryExecution.analyzed.resolve(colName, sqlContext.analyzer.resolver).getOrElse { - throw new RuntimeException( + throw new AnalysisException( s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""") } } - protected[sql] def numericColumns(): Seq[Expression] = { + protected[sql] def numericColumns: Seq[Expression] = { schema.fields.filter(_.dataType.isInstanceOf[NumericType]).map { n => queryExecution.analyzed.resolve(n.name, sqlContext.analyzer.resolver).get } } - + override def toDF(colNames: String*): DataFrame = { require(schema.size == colNames.size, "The number of columns doesn't match.\n" + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala index a5a677b68863f..2ecf086de92f7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala @@ -17,8 +17,8 @@ package org.apache.spark.sql -import scala.language.implicitConversions import scala.collection.JavaConversions._ +import scala.language.implicitConversions import org.apache.spark.sql.catalyst.analysis.Star import org.apache.spark.sql.catalyst.expressions._ @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.plans.logical.Aggregate import org.apache.spark.sql.types.NumericType - /** * A set of methods for aggregations on a [[DataFrame]], created by [[DataFrame.groupBy]]. */ @@ -48,13 +47,13 @@ class GroupedData protected[sql](df: DataFrameImpl, groupingExprs: Seq[Expressio // No columns specified. Use all numeric columns. 
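For readers following the GroupedData changes above, here is a minimal usage sketch (not part of the patch) of the new varargs aggregation methods; it assumes a SQLContext whose implicits are imported, and the data and column names mirror the DataFrameSuite test added in this commit.

```scala
// Usage sketch for SPARK-5799, assuming `import sqlContext.implicits._` is in scope.
// Data and column names mirror the DataFrameSuite test above.
val df1 = Seq(("a", 1, 0, "b"), ("b", 2, 4, "c"), ("a", 2, 3, "d"))
  .toDF("key", "value1", "value2", "rest")

df1.groupBy("key").min()                    // no columns given: all numeric columns
df1.groupBy("key").min("value2")            // only the named numeric column
df1.groupBy("key").avg("value1", "value2")  // several columns via varargs
// df1.groupBy("key").min("rest") would throw an AnalysisException,
// since "rest" is not a numeric column.
```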
df.numericColumns } else { - // Make sure all specified columns are numeric + // Make sure all specified columns are numeric. colNames.map { colName => val namedExpr = df.resolve(colName) if (!namedExpr.dataType.isInstanceOf[NumericType]) { throw new AnalysisException( s""""$colName" is not a numeric column. """ + - "Aggregation function can only be performed on a numeric column.") + "Aggregation function can only be applied on a numeric column.") } namedExpr } @@ -64,7 +63,7 @@ class GroupedData protected[sql](df: DataFrameImpl, groupingExprs: Seq[Expressio Alias(a, a.toString)() } } - + private[this] def strToExpr(expr: String): (Expression => Expression) = { expr.toLowerCase match { case "avg" | "average" | "mean" => Average From 8e25373ce72061d3b6a353259ec627606afa4a5f Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 16 Feb 2015 19:32:31 +0000 Subject: [PATCH 185/357] SPARK-5795 [STREAMING] api.java.JavaPairDStream.saveAsNewAPIHadoopFiles may not friendly to java Revise JavaPairDStream API declaration on saveAs Hadoop methods, to allow it to be called directly as intended. CC tdas for review Author: Sean Owen Closes #4608 from srowen/SPARK-5795 and squashes the following commits: 36f1ead [Sean Owen] Add code that shows compile problem and fix 036bd27 [Sean Owen] Revise JavaPairDStream API declaration on saveAs Hadoop methods, to allow it to be called directly as intended. --- .../streaming/api/java/JavaPairDStream.scala | 20 +++++++++---------- .../apache/spark/streaming/JavaAPISuite.java | 18 +++++++++++++++++ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala index de124cf40eff1..bd01789b611a4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala @@ -726,7 +726,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * Save each RDD in `this` DStream as a Hadoop file. The file name at each batch interval is * generated based on `prefix` and `suffix`: "prefix-TIME_IN_MS.suffix". */ - def saveAsHadoopFiles[F <: OutputFormat[K, V]](prefix: String, suffix: String) { + def saveAsHadoopFiles(prefix: String, suffix: String) { dstream.saveAsHadoopFiles(prefix, suffix) } @@ -734,12 +734,12 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * Save each RDD in `this` DStream as a Hadoop file. The file name at each batch interval is * generated based on `prefix` and `suffix`: "prefix-TIME_IN_MS.suffix". */ - def saveAsHadoopFiles( + def saveAsHadoopFiles[F <: OutputFormat[_, _]]( prefix: String, suffix: String, keyClass: Class[_], valueClass: Class[_], - outputFormatClass: Class[_ <: OutputFormat[_, _]]) { + outputFormatClass: Class[F]) { dstream.saveAsHadoopFiles(prefix, suffix, keyClass, valueClass, outputFormatClass) } @@ -747,12 +747,12 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * Save each RDD in `this` DStream as a Hadoop file. The file name at each batch interval is * generated based on `prefix` and `suffix`: "prefix-TIME_IN_MS.suffix". 
*/ - def saveAsHadoopFiles( + def saveAsHadoopFiles[F <: OutputFormat[_, _]]( prefix: String, suffix: String, keyClass: Class[_], valueClass: Class[_], - outputFormatClass: Class[_ <: OutputFormat[_, _]], + outputFormatClass: Class[F], conf: JobConf) { dstream.saveAsHadoopFiles(prefix, suffix, keyClass, valueClass, outputFormatClass, conf) } @@ -761,7 +761,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * Save each RDD in `this` DStream as a Hadoop file. The file name at each batch interval is * generated based on `prefix` and `suffix`: "prefix-TIME_IN_MS.suffix". */ - def saveAsNewAPIHadoopFiles[F <: NewOutputFormat[K, V]](prefix: String, suffix: String) { + def saveAsNewAPIHadoopFiles(prefix: String, suffix: String) { dstream.saveAsNewAPIHadoopFiles(prefix, suffix) } @@ -769,12 +769,12 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * Save each RDD in `this` DStream as a Hadoop file. The file name at each batch interval is * generated based on `prefix` and `suffix`: "prefix-TIME_IN_MS.suffix". */ - def saveAsNewAPIHadoopFiles( + def saveAsNewAPIHadoopFiles[F <: NewOutputFormat[_, _]]( prefix: String, suffix: String, keyClass: Class[_], valueClass: Class[_], - outputFormatClass: Class[_ <: NewOutputFormat[_, _]]) { + outputFormatClass: Class[F]) { dstream.saveAsNewAPIHadoopFiles(prefix, suffix, keyClass, valueClass, outputFormatClass) } @@ -782,12 +782,12 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * Save each RDD in `this` DStream as a Hadoop file. The file name at each batch interval is * generated based on `prefix` and `suffix`: "prefix-TIME_IN_MS.suffix". */ - def saveAsNewAPIHadoopFiles( + def saveAsNewAPIHadoopFiles[F <: NewOutputFormat[_, _]]( prefix: String, suffix: String, keyClass: Class[_], valueClass: Class[_], - outputFormatClass: Class[_ <: NewOutputFormat[_, _]], + outputFormatClass: Class[F], conf: Configuration = new Configuration) { dstream.saveAsNewAPIHadoopFiles(prefix, suffix, keyClass, valueClass, outputFormatClass, conf) } diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java index 2df8cf6a8a3df..57302ff407183 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java @@ -1828,4 +1828,22 @@ private List> fileTestPrepare(File testDir) throws IOException { return expected; } + + // SPARK-5795: no logic assertions, just testing that intended API invocations compile + private void compileSaveAsJavaAPI(JavaPairDStream pds) { + pds.saveAsNewAPIHadoopFiles( + "", "", LongWritable.class, Text.class, + org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class); + pds.saveAsHadoopFiles( + "", "", LongWritable.class, Text.class, + org.apache.hadoop.mapred.SequenceFileOutputFormat.class); + // Checks that a previous common workaround for this API still compiles + pds.saveAsNewAPIHadoopFiles( + "", "", LongWritable.class, Text.class, + (Class) org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class); + pds.saveAsHadoopFiles( + "", "", LongWritable.class, Text.class, + (Class) org.apache.hadoop.mapred.SequenceFileOutputFormat.class); + } + } From cc552e042896350e21eec9b78593de25006ecc70 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Mon, 16 Feb 2015 12:21:08 -0800 Subject: [PATCH 186/357] [SQL] [Minor] Update the SpecificMutableRow.copy When profiling the Join / Aggregate queries via VisualVM, I noticed lots of 
`SpecificMutableRow` objects created, as well as the `MutableValue`, since the `SpecificMutableRow` are mostly used in data source implementation, but the `copy` method could be called multiple times in upper modules (e.g. in Join / aggregation etc.), duplicated instances created should be avoid. Author: Cheng Hao Closes #4619 from chenghao-intel/specific_mutable_row and squashes the following commits: 9300d23 [Cheng Hao] update the SpecificMutableRow.copy --- .../sql/catalyst/expressions/SpecificMutableRow.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala index 7434165f654f8..21d714c9a8c3b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SpecificMutableRow.scala @@ -220,13 +220,14 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR override def isNullAt(i: Int): Boolean = values(i).isNull override def copy(): Row = { - val newValues = new Array[MutableValue](values.length) + val newValues = new Array[Any](values.length) var i = 0 while (i < values.length) { - newValues(i) = values(i).copy() + newValues(i) = values(i).boxed i += 1 } - new SpecificMutableRow(newValues) + + new GenericRow(newValues) } override def update(ordinal: Int, value: Any): Unit = { From 275a0c08134dea1896eab73a8e017256900fb1db Mon Sep 17 00:00:00 2001 From: Daoyuan Wang Date: Mon, 16 Feb 2015 12:31:36 -0800 Subject: [PATCH 187/357] [SPARK-5824] [SQL] add null format in ctas and set default col comment to null Author: Daoyuan Wang Closes #4609 from adrian-wang/ctas and squashes the following commits: 0a75d5a [Daoyuan Wang] reorder import 93d1863 [Daoyuan Wang] add null format in ctas and set default col comment to null --- .../execution/HiveCompatibilitySuite.scala | 1 + .../spark/sql/hive/HiveMetastoreCatalog.scala | 3 ++- ...ormatCTAS-0-36f9196395758cebfed837a1c391a1e | 0 ...rmatCTAS-1-b5a31d4cb34218b8de1ac3fed59fa75b | 0 ...matCTAS-10-7f4f04b87c7ef9653b4646949b24cf0b | 10 ++++++++++ ...matCTAS-11-4a4c16b53c612d00012d338c97bf5281 | 0 ...matCTAS-12-7f4f04b87c7ef9653b4646949b24cf0b | 10 ++++++++++ ...matCTAS-13-2e59caa113585495d8684fee69d88bc0 | 0 ...matCTAS-14-ad9fe9d68c2cf492259af4f6167c1b12 | 0 ...rmatCTAS-2-aa2bdbd93668dceae43d1a02f2ede68d | 0 ...rmatCTAS-3-b0057150f237050f38c1efa1f2d6b273 | 6 ++++++ ...rmatCTAS-4-16c7086f39d6458b6c5cf2479f0473bd | 0 ...rmatCTAS-5-183d77b734ce6a373de5b3ebe1cd04c9 | 0 ...rmatCTAS-6-159fff36b548e00ee952d1df8ef19833 | 0 ...rmatCTAS-7-46900b082b02ce3e58087d1f41128f65 | 4 ++++ ...rmatCTAS-8-7f26cbd6be5631a3acce26f667d1c5d8 | 18 ++++++++++++++++++ ...rmatCTAS-9-22e1b3899de7087b39c24d9d8f643b47 | 1 + .../org/apache/spark/sql/hive/Shim12.scala | 2 ++ .../org/apache/spark/sql/hive/Shim13.scala | 7 +++++++ 19 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-0-36f9196395758cebfed837a1c391a1e create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-1-b5a31d4cb34218b8de1ac3fed59fa75b create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-10-7f4f04b87c7ef9653b4646949b24cf0b create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-11-4a4c16b53c612d00012d338c97bf5281 create mode 100644 
sql/hive/src/test/resources/golden/nullformatCTAS-12-7f4f04b87c7ef9653b4646949b24cf0b create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-13-2e59caa113585495d8684fee69d88bc0 create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-14-ad9fe9d68c2cf492259af4f6167c1b12 create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-2-aa2bdbd93668dceae43d1a02f2ede68d create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-3-b0057150f237050f38c1efa1f2d6b273 create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-4-16c7086f39d6458b6c5cf2479f0473bd create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-5-183d77b734ce6a373de5b3ebe1cd04c9 create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-6-159fff36b548e00ee952d1df8ef19833 create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-7-46900b082b02ce3e58087d1f41128f65 create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-8-7f26cbd6be5631a3acce26f667d1c5d8 create mode 100644 sql/hive/src/test/resources/golden/nullformatCTAS-9-22e1b3899de7087b39c24d9d8f643b47 diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index e443e5bd5f54d..133f2d3c84a2e 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -640,6 +640,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "nonblock_op_deduplicate", "notable_alias1", "notable_alias2", + "nullformatCTAS", "nullgroup", "nullgroup2", "nullgroup3", diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 6d794d0e11391..f82778c87672c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -240,7 +240,7 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val hiveSchema: JList[FieldSchema] = if (schema == null || schema.isEmpty) { crtTbl.getCols } else { - schema.map(attr => new FieldSchema(attr.name, toMetastoreType(attr.dataType), "")) + schema.map(attr => new FieldSchema(attr.name, toMetastoreType(attr.dataType), null)) } tbl.setFields(hiveSchema) @@ -314,6 +314,7 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with if (crtTbl != null && crtTbl.getLineDelim() != null) { tbl.setSerdeParam(serdeConstants.LINE_DELIM, crtTbl.getLineDelim()) } + HiveShim.setTblNullFormat(crtTbl, tbl) if (crtTbl != null && crtTbl.getSerdeProps() != null) { val iter = crtTbl.getSerdeProps().entrySet().iterator() diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-0-36f9196395758cebfed837a1c391a1e b/sql/hive/src/test/resources/golden/nullformatCTAS-0-36f9196395758cebfed837a1c391a1e new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-1-b5a31d4cb34218b8de1ac3fed59fa75b b/sql/hive/src/test/resources/golden/nullformatCTAS-1-b5a31d4cb34218b8de1ac3fed59fa75b new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git 
a/sql/hive/src/test/resources/golden/nullformatCTAS-10-7f4f04b87c7ef9653b4646949b24cf0b b/sql/hive/src/test/resources/golden/nullformatCTAS-10-7f4f04b87c7ef9653b4646949b24cf0b new file mode 100644 index 0000000000000..e74deff51c9ba --- /dev/null +++ b/sql/hive/src/test/resources/golden/nullformatCTAS-10-7f4f04b87c7ef9653b4646949b24cf0b @@ -0,0 +1,10 @@ +1.0 1 +1.0 1 +1.0 1 +1.0 1 +1.0 1 +NULL 1 +NULL NULL +1.0 NULL +1.0 1 +1.0 1 diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-11-4a4c16b53c612d00012d338c97bf5281 b/sql/hive/src/test/resources/golden/nullformatCTAS-11-4a4c16b53c612d00012d338c97bf5281 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-12-7f4f04b87c7ef9653b4646949b24cf0b b/sql/hive/src/test/resources/golden/nullformatCTAS-12-7f4f04b87c7ef9653b4646949b24cf0b new file mode 100644 index 0000000000000..00ebb521970dd --- /dev/null +++ b/sql/hive/src/test/resources/golden/nullformatCTAS-12-7f4f04b87c7ef9653b4646949b24cf0b @@ -0,0 +1,10 @@ +1.0 1 +1.0 1 +1.0 1 +1.0 1 +1.0 1 +fooNull 1 +fooNull fooNull +1.0 fooNull +1.0 1 +1.0 1 diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-13-2e59caa113585495d8684fee69d88bc0 b/sql/hive/src/test/resources/golden/nullformatCTAS-13-2e59caa113585495d8684fee69d88bc0 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-14-ad9fe9d68c2cf492259af4f6167c1b12 b/sql/hive/src/test/resources/golden/nullformatCTAS-14-ad9fe9d68c2cf492259af4f6167c1b12 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-2-aa2bdbd93668dceae43d1a02f2ede68d b/sql/hive/src/test/resources/golden/nullformatCTAS-2-aa2bdbd93668dceae43d1a02f2ede68d new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-3-b0057150f237050f38c1efa1f2d6b273 b/sql/hive/src/test/resources/golden/nullformatCTAS-3-b0057150f237050f38c1efa1f2d6b273 new file mode 100644 index 0000000000000..b00bcb3624532 --- /dev/null +++ b/sql/hive/src/test/resources/golden/nullformatCTAS-3-b0057150f237050f38c1efa1f2d6b273 @@ -0,0 +1,6 @@ +a string +b string +c string +d string + +Detailed Table Information Table(tableName:base_tab, dbName:default, owner:animal, createTime:1423973915, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:a, type:string, comment:null), FieldSchema(name:b, type:string, comment:null), FieldSchema(name:c, type:string, comment:null), FieldSchema(name:d, type:string, comment:null)], location:file:/tmp/sparkHiveWarehouse2573474017665704744/base_tab, inputFormat:org.apache.hadoop.mapred.TextInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, parameters:{serialization.format=1}), bucketCols:[], sortCols:[], parameters:{}, skewedInfo:SkewedInfo(skewedColNames:[], skewedColValues:[], skewedColValueLocationMaps:{}), storedAsSubDirectories:false), partitionKeys:[], parameters:{numFiles=1, transient_lastDdlTime=1423973915, COLUMN_STATS_ACCURATE=true, totalSize=130, numRows=0, rawDataSize=0}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE) diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-4-16c7086f39d6458b6c5cf2479f0473bd b/sql/hive/src/test/resources/golden/nullformatCTAS-4-16c7086f39d6458b6c5cf2479f0473bd 
new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-5-183d77b734ce6a373de5b3ebe1cd04c9 b/sql/hive/src/test/resources/golden/nullformatCTAS-5-183d77b734ce6a373de5b3ebe1cd04c9 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-6-159fff36b548e00ee952d1df8ef19833 b/sql/hive/src/test/resources/golden/nullformatCTAS-6-159fff36b548e00ee952d1df8ef19833 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-7-46900b082b02ce3e58087d1f41128f65 b/sql/hive/src/test/resources/golden/nullformatCTAS-7-46900b082b02ce3e58087d1f41128f65 new file mode 100644 index 0000000000000..264c973ff7af1 --- /dev/null +++ b/sql/hive/src/test/resources/golden/nullformatCTAS-7-46900b082b02ce3e58087d1f41128f65 @@ -0,0 +1,4 @@ +a string +b string + +Detailed Table Information Table(tableName:null_tab3, dbName:default, owner:animal, createTime:1423973928, lastAccessTime:0, retention:0, sd:StorageDescriptor(cols:[FieldSchema(name:a, type:string, comment:null), FieldSchema(name:b, type:string, comment:null)], location:file:/tmp/sparkHiveWarehouse2573474017665704744/null_tab3, inputFormat:org.apache.hadoop.mapred.TextInputFormat, outputFormat:org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat, compressed:false, numBuckets:-1, serdeInfo:SerDeInfo(name:null, serializationLib:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, parameters:{serialization.null.format=fooNull, serialization.format=1}), bucketCols:[], sortCols:[], parameters:{}, skewedInfo:SkewedInfo(skewedColNames:[], skewedColValues:[], skewedColValueLocationMaps:{}), storedAsSubDirectories:false), partitionKeys:[], parameters:{numFiles=1, transient_lastDdlTime=1423973928, COLUMN_STATS_ACCURATE=true, totalSize=80, numRows=10, rawDataSize=70}, viewOriginalText:null, viewExpandedText:null, tableType:MANAGED_TABLE) diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-8-7f26cbd6be5631a3acce26f667d1c5d8 b/sql/hive/src/test/resources/golden/nullformatCTAS-8-7f26cbd6be5631a3acce26f667d1c5d8 new file mode 100644 index 0000000000000..881917bcf1c69 --- /dev/null +++ b/sql/hive/src/test/resources/golden/nullformatCTAS-8-7f26cbd6be5631a3acce26f667d1c5d8 @@ -0,0 +1,18 @@ +CREATE TABLE `null_tab3`( + `a` string, + `b` string) +ROW FORMAT DELIMITED + NULL DEFINED AS 'fooNull' +STORED AS INPUTFORMAT + 'org.apache.hadoop.mapred.TextInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' +LOCATION + 'file:/tmp/sparkHiveWarehouse2573474017665704744/null_tab3' +TBLPROPERTIES ( + 'numFiles'='1', + 'transient_lastDdlTime'='1423973928', + 'COLUMN_STATS_ACCURATE'='true', + 'totalSize'='80', + 'numRows'='10', + 'rawDataSize'='70') diff --git a/sql/hive/src/test/resources/golden/nullformatCTAS-9-22e1b3899de7087b39c24d9d8f643b47 b/sql/hive/src/test/resources/golden/nullformatCTAS-9-22e1b3899de7087b39c24d9d8f643b47 new file mode 100644 index 0000000000000..3a2e3f4984a0e --- /dev/null +++ b/sql/hive/src/test/resources/golden/nullformatCTAS-9-22e1b3899de7087b39c24d9d8f643b47 @@ -0,0 +1 @@ +-1 diff --git a/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala b/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala index b5a0754ff61f9..8534c7d7064e5 100644 --- a/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala +++ b/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala @@ -245,6 
+245,8 @@ private[hive] object HiveShim { def prepareWritable(w: Writable): Writable = { w } + + def setTblNullFormat(crtTbl: CreateTableDesc, tbl: Table) = {} } class ShimFileSinkDesc(var dir: String, var tableInfo: TableDesc, var compressed: Boolean) diff --git a/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala b/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala index e4c1809c8bb21..72104f5b55761 100644 --- a/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala +++ b/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala @@ -35,6 +35,7 @@ import org.apache.hadoop.hive.ql.Context import org.apache.hadoop.hive.ql.metadata.{Table, Hive, Partition} import org.apache.hadoop.hive.ql.plan.{CreateTableDesc, FileSinkDesc, TableDesc} import org.apache.hadoop.hive.ql.processors.CommandProcessorFactory +import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfo, DecimalTypeInfo, TypeInfoFactory} import org.apache.hadoop.hive.serde2.objectinspector.primitive.{HiveDecimalObjectInspector, PrimitiveObjectInspectorFactory} import org.apache.hadoop.hive.serde2.objectinspector.{PrimitiveObjectInspector, ObjectInspector} @@ -410,6 +411,12 @@ private[hive] object HiveShim { } w } + + def setTblNullFormat(crtTbl: CreateTableDesc, tbl: Table) = { + if (crtTbl != null && crtTbl.getNullFormat() != null) { + tbl.setSerdeParam(serdeConstants.SERIALIZATION_NULL_FORMAT, crtTbl.getNullFormat()) + } + } } /* From 104b2c45805ce0a9c86e2823f402de6e9f0aee81 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Mon, 16 Feb 2015 12:32:56 -0800 Subject: [PATCH 188/357] [SQL] Initial support for reporting location of error in sql string Author: Michael Armbrust Closes #4587 from marmbrus/position and squashes the following commits: 0810052 [Michael Armbrust] fix tests 395c019 [Michael Armbrust] Merge remote-tracking branch 'marmbrus/position' into position e155dce [Michael Armbrust] more errors f3efa51 [Michael Armbrust] Update AnalysisException.scala d45ff60 [Michael Armbrust] [SQL] Initial support for reporting location of error in sql string --- .../apache/spark/sql/AnalysisException.scala | 17 +- .../sql/catalyst/analysis/Analyzer.scala | 19 +- .../spark/sql/catalyst/analysis/Catalog.scala | 6 + .../spark/sql/catalyst/analysis/package.scala | 10 ++ .../spark/sql/catalyst/trees/TreeNode.scala | 60 ++++++- .../sql/catalyst/trees/TreeNodeSuite.scala | 14 ++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 9 +- .../org/apache/spark/sql/hive/HiveQl.scala | 47 +++-- .../spark/sql/hive/CachedTableSuite.scala | 4 +- .../spark/sql/hive/ErrorPositionSuite.scala | 163 ++++++++++++++++++ .../sql/hive/execution/SQLQuerySuite.scala | 4 +- 11 files changed, 314 insertions(+), 39 deletions(-) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala index 871d560b9d54f..15add84878ecf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala @@ -17,7 +17,22 @@ package org.apache.spark.sql +import org.apache.spark.annotation.DeveloperApi + /** + * :: DeveloperApi :: * Thrown when a query fails to analyze, usually because the query itself is invalid. 
*/ -class AnalysisException(message: String) extends Exception(message) with Serializable +@DeveloperApi +class AnalysisException protected[sql] ( + val message: String, + val line: Option[Int] = None, + val startPosition: Option[Int] = None) + extends Exception with Serializable { + + override def getMessage: String = { + val lineAnnotation = line.map(l => s" line $l").getOrElse("") + val positionAnnotation = startPosition.map(p => s" pos $p").getOrElse("") + s"$message;$lineAnnotation$positionAnnotation" + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 58a7003977c93..aa4320bd582cb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -85,7 +85,7 @@ class Analyzer(catalog: Catalog, operator transformExpressionsUp { case a: Attribute if !a.resolved => val from = operator.inputSet.map(_.name).mkString(", ") - failAnalysis(s"cannot resolve '${a.prettyString}' given input columns $from") + a.failAnalysis(s"cannot resolve '${a.prettyString}' given input columns $from") case c: Cast if !c.resolved => failAnalysis( @@ -246,12 +246,21 @@ class Analyzer(catalog: Catalog, * Replaces [[UnresolvedRelation]]s with concrete relations from the catalog. */ object ResolveRelations extends Rule[LogicalPlan] { + def getTable(u: UnresolvedRelation) = { + try { + catalog.lookupRelation(u.tableIdentifier, u.alias) + } catch { + case _: NoSuchTableException => + u.failAnalysis(s"no such table ${u.tableIdentifier}") + } + } + def apply(plan: LogicalPlan): LogicalPlan = plan transform { - case i @ InsertIntoTable(UnresolvedRelation(tableIdentifier, alias), _, _, _) => + case i @ InsertIntoTable(u: UnresolvedRelation, _, _, _) => i.copy( - table = EliminateSubQueries(catalog.lookupRelation(tableIdentifier, alias))) - case UnresolvedRelation(tableIdentifier, alias) => - catalog.lookupRelation(tableIdentifier, alias) + table = EliminateSubQueries(getTable(u))) + case u: UnresolvedRelation => + getTable(u) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala index f57eab24607f8..bf97215ee67da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala @@ -21,6 +21,12 @@ import scala.collection.mutable import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Subquery} +/** + * Thrown by a catalog when a table cannot be found. The analzyer will rethrow the exception + * as an AnalysisException with the correct position information. + */ +class NoSuchTableException extends Exception + /** * An interface for looking up relations by name. Used by an [[Analyzer]]. 
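A brief sketch (not part of the patch) of what the position-enriched AnalysisException introduced above looks like to a caller; the object name, message text and positions are illustrative, and since the constructor is `protected[sql]` the snippet is assumed to live inside the `org.apache.spark.sql` package.

```scala
// Illustrative sketch only; assumes compilation inside the org.apache.spark.sql
// package because the AnalysisException constructor is protected[sql].
package org.apache.spark.sql

object AnalysisExceptionDemo {
  def main(args: Array[String]): Unit = {
    val e = new AnalysisException(
      "cannot resolve 'x' given input columns key, value",  // illustrative message
      line = Some(2),
      startPosition = Some(7))

    // getMessage appends the optional position information:
    // cannot resolve 'x' given input columns key, value; line 2 pos 7
    println(e.getMessage)
  }
}
```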
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala index 5dc9d0e566087..e95f19e69ed43 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/package.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.catalyst +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.trees.TreeNode + /** * Provides a logical query plan [[Analyzer]] and supporting classes for performing analysis. * Analysis consists of translating [[UnresolvedAttribute]]s and [[UnresolvedRelation]]s @@ -32,4 +35,11 @@ package object analysis { val caseInsensitiveResolution = (a: String, b: String) => a.equalsIgnoreCase(b) val caseSensitiveResolution = (a: String, b: String) => a == b + + implicit class AnalysisErrorAt(t: TreeNode[_]) { + /** Fails the analysis at the point where a specific tree node was parsed. */ + def failAnalysis(msg: String) = { + throw new AnalysisException(msg, t.origin.line, t.origin.startPosition) + } + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index e0930b056d5fa..109671bdca361 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -22,9 +22,42 @@ import org.apache.spark.sql.catalyst.errors._ /** Used by [[TreeNode.getNodeNumbered]] when traversing the tree for a given number */ private class MutableInt(var i: Int) +case class Origin( + line: Option[Int] = None, + startPosition: Option[Int] = None) + +/** + * Provides a location for TreeNodes to ask about the context of their origin. For example, which + * line of code is currently being parsed. + */ +object CurrentOrigin { + private val value = new ThreadLocal[Origin]() { + override def initialValue: Origin = Origin() + } + + def get = value.get() + def set(o: Origin) = value.set(o) + + def reset() = value.set(Origin()) + + def setPosition(line: Int, start: Int) = { + value.set( + value.get.copy(line = Some(line), startPosition = Some(start))) + } + + def withOrigin[A](o: Origin)(f: => A): A = { + set(o) + val ret = try f finally { reset() } + reset() + ret + } +} + abstract class TreeNode[BaseType <: TreeNode[BaseType]] { self: BaseType with Product => + val origin = CurrentOrigin.get + /** Returns a Seq of the children of this node */ def children: Seq[BaseType] @@ -150,7 +183,10 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] { * @param rule the function used to transform this nodes children */ def transformDown(rule: PartialFunction[BaseType, BaseType]): BaseType = { - val afterRule = rule.applyOrElse(this, identity[BaseType]) + val afterRule = CurrentOrigin.withOrigin(origin) { + rule.applyOrElse(this, identity[BaseType]) + } + // Check if unchanged and then possibly return old copy to avoid gc churn. 
if (this fastEquals afterRule) { transformChildrenDown(rule) @@ -210,9 +246,13 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] { def transformUp(rule: PartialFunction[BaseType, BaseType]): BaseType = { val afterRuleOnChildren = transformChildrenUp(rule); if (this fastEquals afterRuleOnChildren) { - rule.applyOrElse(this, identity[BaseType]) + CurrentOrigin.withOrigin(origin) { + rule.applyOrElse(this, identity[BaseType]) + } } else { - rule.applyOrElse(afterRuleOnChildren, identity[BaseType]) + CurrentOrigin.withOrigin(origin) { + rule.applyOrElse(afterRuleOnChildren, identity[BaseType]) + } } } @@ -268,12 +308,14 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] { */ def makeCopy(newArgs: Array[AnyRef]): this.type = attachTree(this, "makeCopy") { try { - // Skip no-arg constructors that are just there for kryo. - val defaultCtor = getClass.getConstructors.find(_.getParameterTypes.size != 0).head - if (otherCopyArgs.isEmpty) { - defaultCtor.newInstance(newArgs: _*).asInstanceOf[this.type] - } else { - defaultCtor.newInstance((newArgs ++ otherCopyArgs).toArray: _*).asInstanceOf[this.type] + CurrentOrigin.withOrigin(origin) { + // Skip no-arg constructors that are just there for kryo. + val defaultCtor = getClass.getConstructors.find(_.getParameterTypes.size != 0).head + if (otherCopyArgs.isEmpty) { + defaultCtor.newInstance(newArgs: _*).asInstanceOf[this.type] + } else { + defaultCtor.newInstance((newArgs ++ otherCopyArgs).toArray: _*).asInstanceOf[this.type] + } } } catch { case e: java.lang.IllegalArgumentException => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala index cdb843f959704..e7ce92a2160b6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala @@ -104,4 +104,18 @@ class TreeNodeSuite extends FunSuite { assert(actual === Dummy(None)) } + test("preserves origin") { + CurrentOrigin.setPosition(1,1) + val add = Add(Literal(1), Literal(1)) + CurrentOrigin.reset() + + val transformed = add transform { + case Literal(1, _) => Literal(2) + } + + assert(transformed.origin.line.isDefined) + assert(transformed.origin.startPosition.isDefined) + } + + } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index f82778c87672c..12f86a04a37af 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -31,8 +31,8 @@ import org.apache.hadoop.hive.serde2.{Deserializer, SerDeException} import org.apache.hadoop.util.ReflectionUtils import org.apache.spark.Logging -import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.catalyst.analysis.{Catalog, OverrideCatalog} +import org.apache.spark.sql.{AnalysisException, SQLContext} +import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, Catalog, OverrideCatalog} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical @@ -154,7 +154,10 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val databaseName = tableIdent.lift(tableIdent.size - 2).getOrElse( 
hive.sessionState.getCurrentDatabase) val tblName = tableIdent.last - val table = client.getTable(databaseName, tblName) + val table = try client.getTable(databaseName, tblName) catch { + case te: org.apache.hadoop.hive.ql.metadata.InvalidTableException => + throw new NoSuchTableException + } if (table.getProperty("spark.sql.sources.provider") != null) { cachedDataSourceTables(QualifiedTableName(databaseName, tblName).toLowerCase) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 5269460e5b6bc..5a1825a87dadb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive import java.sql.Date + import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.hive.conf.HiveConf @@ -27,13 +28,14 @@ import org.apache.hadoop.hive.ql.lib.Node import org.apache.hadoop.hive.ql.metadata.Table import org.apache.hadoop.hive.ql.parse._ import org.apache.hadoop.hive.ql.plan.PlanUtils -import org.apache.spark.sql.SparkSQLParser +import org.apache.spark.sql.{AnalysisException, SparkSQLParser} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.trees.CurrentOrigin import org.apache.spark.sql.execution.ExplainCommand import org.apache.spark.sql.sources.DescribeCommand import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, AnalyzeTable, HiveScriptIOSchema} @@ -211,12 +213,6 @@ private[hive] object HiveQl { } } - class ParseException(sql: String, cause: Throwable) - extends Exception(s"Failed to parse: $sql", cause) - - class SemanticException(msg: String) - extends Exception(s"Error in semantic analysis: $msg") - /** * Returns the AST for the given SQL string. */ @@ -236,8 +232,10 @@ private[hive] object HiveQl { /** Returns a LogicalPlan for a given HiveQL string. */ def parseSql(sql: String): LogicalPlan = hqlParser(sql) + val errorRegEx = "line (\\d+):(\\d+) (.*)".r + /** Creates LogicalPlan for a given HiveQL string. */ - def createPlan(sql: String) = { + def createPlan(sql: String): LogicalPlan = { try { val tree = getAst(sql) if (nativeCommands contains tree.getText) { @@ -249,14 +247,23 @@ private[hive] object HiveQl { } } } catch { - case e: Exception => throw new ParseException(sql, e) - case e: NotImplementedError => sys.error( - s""" - |Unsupported language features in query: $sql - |${dumpTree(getAst(sql))} - |$e - |${e.getStackTrace.head} - """.stripMargin) + case pe: org.apache.hadoop.hive.ql.parse.ParseException => + pe.getMessage match { + case errorRegEx(line, start, message) => + throw new AnalysisException(message, Some(line.toInt), Some(start.toInt)) + case otherMessage => + throw new AnalysisException(otherMessage) + } + case e: Exception => + throw new AnalysisException(e.getMessage) + case e: NotImplementedError => + throw new AnalysisException( + s""" + |Unsupported language features in query: $sql + |${dumpTree(getAst(sql))} + |$e + |${e.getStackTrace.head} + """.stripMargin) } } @@ -292,6 +299,7 @@ private[hive] object HiveQl { /** @return matches of the form (tokenName, children). 
*/ def unapply(t: Any): Option[(String, Seq[ASTNode])] = t match { case t: ASTNode => + CurrentOrigin.setPosition(t.getLine, t.getCharPositionInLine) Some((t.getText, Option(t.getChildren).map(_.toList).getOrElse(Nil).asInstanceOf[Seq[ASTNode]])) case _ => None @@ -1278,7 +1286,12 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C def dumpTree(node: Node, builder: StringBuilder = new StringBuilder, indent: Int = 0) : StringBuilder = { node match { - case a: ASTNode => builder.append((" " * indent) + a.getText + "\n") + case a: ASTNode => builder.append( + (" " * indent) + a.getText + " " + + a.getLine + ", " + + a.getTokenStartIndex + "," + + a.getTokenStopIndex + ", " + + a.getCharPositionInLine + "\n") case other => sys.error(s"Non ASTNode encountered: $other") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index 7c8b5205e239e..44d24273e722a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.columnar.{InMemoryColumnarTableScan, InMemoryRelation} import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.{DataFrame, QueryTest} +import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest} import org.apache.spark.storage.RDDBlockId class CachedTableSuite extends QueryTest { @@ -96,7 +96,7 @@ class CachedTableSuite extends QueryTest { cacheTable("test") sql("SELECT * FROM test").collect() sql("DROP TABLE test") - intercept[org.apache.hadoop.hive.ql.metadata.InvalidTableException] { + intercept[AnalysisException] { sql("SELECT * FROM test").collect() } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala new file mode 100644 index 0000000000000..f04437c595bf6 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ErrorPositionSuite.scala @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive + +import java.io.{OutputStream, PrintStream} + +import org.apache.spark.sql.hive.test.TestHive._ +import org.apache.spark.sql.{AnalysisException, QueryTest} + +import scala.util.Try + +class ErrorPositionSuite extends QueryTest { + + positionTest("unresolved attribute 1", + "SELECT x FROM src", "x") + + positionTest("unresolved attribute 2", + "SELECT x FROM src", "x") + + positionTest("unresolved attribute 3", + "SELECT key, x FROM src", "x") + + positionTest("unresolved attribute 4", + """SELECT key, + |x FROM src + """.stripMargin, "x") + + positionTest("unresolved attribute 5", + """SELECT key, + | x FROM src + """.stripMargin, "x") + + positionTest("unresolved attribute 6", + """SELECT key, + | + | 1 + x FROM src + """.stripMargin, "x") + + positionTest("unresolved attribute 7", + """SELECT key, + | + | 1 + x + 1 FROM src + """.stripMargin, "x") + + positionTest("multi-char unresolved attribute", + """SELECT key, + | + | 1 + abcd + 1 FROM src + """.stripMargin, "abcd") + + positionTest("unresolved attribute group by", + """SELECT key FROM src GROUP BY + |x + """.stripMargin, "x") + + positionTest("unresolved attribute order by", + """SELECT key FROM src ORDER BY + |x + """.stripMargin, "x") + + positionTest("unresolved attribute where", + """SELECT key FROM src + |WHERE x = true + """.stripMargin, "x") + + positionTest("unresolved attribute backticks", + "SELECT `x` FROM src", "`x`") + + positionTest("parse error", + "SELECT WHERE", "WHERE") + + positionTest("bad relation", + "SELECT * FROM badTable", "badTable") + + ignore("other expressions") { + positionTest("bad addition", + "SELECT 1 + array(1)", "1 + array") + } + + /** Hive can be very noisy, messing up the output of our tests. */ + private def quietly[A](f: => A): A = { + val origErr = System.err + val origOut = System.out + try { + System.setErr(new PrintStream(new OutputStream { + def write(b: Int) = {} + })) + System.setOut(new PrintStream(new OutputStream { + def write(b: Int) = {} + })) + + f + } finally { + System.setErr(origErr) + System.setOut(origOut) + } + } + + /** + * Creates a test that checks to see if the error thrown when analyzing a given query includes + * the location of the given token in the query string. + * + * @param name the name of the test + * @param query the query to analyze + * @param token a unique token in the string that should be indicated by the exception + */ + def positionTest(name: String, query: String, token: String) = { + def parseTree = + Try(quietly(HiveQl.dumpTree(HiveQl.getAst(query)))).getOrElse("") + + test(name) { + val error = intercept[AnalysisException] { + quietly(sql(query)) + } + val (line, expectedLineNum) = query.split("\n").zipWithIndex.collect { + case (l, i) if l.contains(token) => (l, i + 1) + }.headOption.getOrElse(sys.error(s"Invalid test. Token $token not in $query")) + val actualLine = error.line.getOrElse { + fail( + s"line not returned for error '${error.getMessage}' on token $token\n$parseTree" + ) + } + assert(actualLine === expectedLineNum, "wrong line") + + val expectedStart = line.indexOf(token) + val actualStart = error.startPosition.getOrElse { + fail( + s"start not returned for error on token $token\n" + + HiveQl.dumpTree(HiveQl.getAst(query)) + ) + } + assert(expectedStart === actualStart, + s"""Incorrect start position. 
+ |== QUERY == + |$query + | + |== AST == + |$parseTree + | + |Actual: $actualStart, Expected: $expectedStart + |$line + |${" " * actualStart}^ + |0123456789 123456789 1234567890 + | 2 3 + """.stripMargin) + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 978825938395f..e8d9eec3d88ff 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.{QueryTest, Row, SQLConf} +import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SQLConf} case class Nested1(f1: Nested2) case class Nested2(f2: Nested3) @@ -185,7 +185,7 @@ class SQLQuerySuite extends QueryTest { sql("SELECT * FROM test_ctas_1234"), sql("SELECT * FROM nested").collect().toSeq) - intercept[org.apache.hadoop.hive.ql.metadata.InvalidTableException] { + intercept[AnalysisException] { sql("CREATE TABLE test_ctas_12345 AS SELECT * from notexists").collect() } } From b4d7c7032d755de42951f92d9535287ef6230b9b Mon Sep 17 00:00:00 2001 From: OopsOutOfMemory Date: Mon, 16 Feb 2015 12:34:09 -0800 Subject: [PATCH 189/357] [SQL] Add fetched row count in SparkSQLCLIDriver before this change: ```scala Time taken: 0.619 seconds ``` after this change : ```scala Time taken: 0.619 seconds, Fetched: 4 row(s) ``` Author: OopsOutOfMemory Closes #4604 from OopsOutOfMemory/rowcount and squashes the following commits: 7252dea [OopsOutOfMemory] add fetched row count --- .../sql/hive/thriftserver/SparkSQLCLIDriver.scala | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index bb19ac232fcbe..401e97b162dea 100755 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -292,9 +292,13 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging { } } + var counter = 0 try { while (!out.checkError() && driver.getResults(res)) { - res.foreach(out.println) + res.foreach{ l => + counter += 1 + out.println(l) + } res.clear() } } catch { @@ -311,7 +315,11 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging { ret = cret } - console.printInfo(s"Time taken: $timeTaken seconds", null) + var responseMsg = s"Time taken: $timeTaken seconds" + if (counter != 0) { + responseMsg += s", Fetched $counter row(s)" + } + console.printInfo(responseMsg , null) // Destroy the driver to release all the locks. 
driver.destroy() } else { From 6f54dee66100e5e58f6649158db257eb5009bd6a Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 16 Feb 2015 12:48:55 -0800 Subject: [PATCH 190/357] [SPARK-5296] [SQL] Add more filter types for data sources API This PR adds the following filter types for data sources API: - `IsNull` - `IsNotNull` - `Not` - `And` - `Or` The code which converts Catalyst predicate expressions to data sources filters is very similar to filter conversion logics in `ParquetFilters` which converts Catalyst predicates to Parquet filter predicates. In this way we can support nested AND/OR/NOT predicates without changing current `BaseScan` type hierarchy. [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4623) Author: Cheng Lian This patch had conflicts when merged, resolved by Committer: Michael Armbrust Closes #4623 from liancheng/more-fiters and squashes the following commits: 1b296f4 [Cheng Lian] Add more filter types for data sources API --- .../org/apache/spark/sql/SQLContext.scala | 9 ++- .../apache/spark/sql/parquet/newParquet.scala | 5 +- .../sql/sources/DataSourceStrategy.scala | 81 +++++++++++++------ .../apache/spark/sql/sources/filters.scala | 5 ++ .../spark/sql/sources/FilteredScanSuite.scala | 34 +++++++- 5 files changed, 103 insertions(+), 31 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index b42a52ebd2f16..1442250569416 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -28,16 +28,16 @@ import scala.reflect.runtime.universe.TypeTag import org.apache.spark.annotation.{AlphaComponent, DeveloperApi, Experimental} import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.optimizer.{DefaultOptimizer, Optimizer} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, NoRelation} import org.apache.spark.sql.catalyst.rules.RuleExecutor -import org.apache.spark.sql.execution._ +import org.apache.spark.sql.catalyst.{ScalaReflection, expressions} +import org.apache.spark.sql.execution.{Filter, _} import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation} import org.apache.spark.sql.json._ -import org.apache.spark.sql.sources.{BaseRelation, DDLParser, DataSourceStrategy, LogicalRelation, _} +import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.util.Utils import org.apache.spark.{Partition, SparkContext} @@ -867,7 +867,8 @@ class SQLContext(@transient val sparkContext: SparkContext) val projectSet = AttributeSet(projectList.flatMap(_.references)) val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) - val filterCondition = prunePushedDownFilters(filterPredicates).reduceLeftOption(And) + val filterCondition = + prunePushedDownFilters(filterPredicates).reduceLeftOption(expressions.And) // Right now we still use a projection even if the only evaluation is applying an alias // to a column. Since this is a no-op, it could be avoided. 
However, using this diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 9279f5a903f55..9bb34e2df9a26 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -32,6 +32,7 @@ import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.mapreduce.{InputSplit, Job, JobContext} + import parquet.filter2.predicate.FilterApi import parquet.format.converter.ParquetMetadataConverter import parquet.hadoop.metadata.CompressionCodecName @@ -42,6 +43,7 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil import org.apache.spark.rdd.{NewHadoopPartition, NewHadoopRDD, RDD} +import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.parquet.ParquetTypesConverter._ import org.apache.spark.sql.sources._ @@ -497,7 +499,8 @@ case class ParquetRelation2( _.references.map(_.name).toSet.subsetOf(partitionColumnNames) } - val rawPredicate = partitionPruningPredicates.reduceOption(And).getOrElse(Literal(true)) + val rawPredicate = + partitionPruningPredicates.reduceOption(expressions.And).getOrElse(Literal(true)) val boundPredicate = InterpretedPredicate(rawPredicate transform { case a: AttributeReference => val index = partitionColumns.indexWhere(a.name == _.name) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala index 624369afe87b5..a853385fdac68 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala @@ -19,11 +19,11 @@ package org.apache.spark.sql.sources import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, AttributeSet, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.{Row, Strategy, execution} +import org.apache.spark.sql.{Row, Strategy, execution, sources} /** * A Strategy for planning scans over data sources defined using the sources API. @@ -88,7 +88,7 @@ private[sql] object DataSourceStrategy extends Strategy { val projectSet = AttributeSet(projectList.flatMap(_.references)) val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) - val filterCondition = filterPredicates.reduceLeftOption(And) + val filterCondition = filterPredicates.reduceLeftOption(expressions.And) val pushedFilters = filterPredicates.map { _ transform { case a: AttributeReference => relation.attributeMap(a) // Match original case of attributes. @@ -118,27 +118,60 @@ private[sql] object DataSourceStrategy extends Strategy { } } - /** Turn Catalyst [[Expression]]s into data source [[Filter]]s. 
*/ - protected[sql] def selectFilters(filters: Seq[Expression]): Seq[Filter] = filters.collect { - case expressions.EqualTo(a: Attribute, expressions.Literal(v, _)) => EqualTo(a.name, v) - case expressions.EqualTo(expressions.Literal(v, _), a: Attribute) => EqualTo(a.name, v) - - case expressions.GreaterThan(a: Attribute, expressions.Literal(v, _)) => GreaterThan(a.name, v) - case expressions.GreaterThan(expressions.Literal(v, _), a: Attribute) => LessThan(a.name, v) - - case expressions.LessThan(a: Attribute, expressions.Literal(v, _)) => LessThan(a.name, v) - case expressions.LessThan(expressions.Literal(v, _), a: Attribute) => GreaterThan(a.name, v) - - case expressions.GreaterThanOrEqual(a: Attribute, expressions.Literal(v, _)) => - GreaterThanOrEqual(a.name, v) - case expressions.GreaterThanOrEqual(expressions.Literal(v, _), a: Attribute) => - LessThanOrEqual(a.name, v) - - case expressions.LessThanOrEqual(a: Attribute, expressions.Literal(v, _)) => - LessThanOrEqual(a.name, v) - case expressions.LessThanOrEqual(expressions.Literal(v, _), a: Attribute) => - GreaterThanOrEqual(a.name, v) + /** + * Selects Catalyst predicate [[Expression]]s which are convertible into data source [[Filter]]s, + * and convert them. + */ + protected[sql] def selectFilters(filters: Seq[Expression]) = { + def translate(predicate: Expression): Option[Filter] = predicate match { + case expressions.EqualTo(a: Attribute, Literal(v, _)) => + Some(sources.EqualTo(a.name, v)) + case expressions.EqualTo(Literal(v, _), a: Attribute) => + Some(sources.EqualTo(a.name, v)) + + case expressions.GreaterThan(a: Attribute, Literal(v, _)) => + Some(sources.GreaterThan(a.name, v)) + case expressions.GreaterThan(Literal(v, _), a: Attribute) => + Some(sources.LessThan(a.name, v)) + + case expressions.LessThan(a: Attribute, Literal(v, _)) => + Some(sources.LessThan(a.name, v)) + case expressions.LessThan(Literal(v, _), a: Attribute) => + Some(sources.GreaterThan(a.name, v)) + + case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, _)) => + Some(sources.GreaterThanOrEqual(a.name, v)) + case expressions.GreaterThanOrEqual(Literal(v, _), a: Attribute) => + Some(sources.LessThanOrEqual(a.name, v)) + + case expressions.LessThanOrEqual(a: Attribute, Literal(v, _)) => + Some(sources.LessThanOrEqual(a.name, v)) + case expressions.LessThanOrEqual(Literal(v, _), a: Attribute) => + Some(sources.GreaterThanOrEqual(a.name, v)) + + case expressions.InSet(a: Attribute, set) => + Some(sources.In(a.name, set.toArray)) + + case expressions.IsNull(a: Attribute) => + Some(sources.IsNull(a.name)) + case expressions.IsNotNull(a: Attribute) => + Some(sources.IsNotNull(a.name)) + + case expressions.And(left, right) => + (translate(left) ++ translate(right)).reduceOption(sources.And) + + case expressions.Or(left, right) => + for { + leftFilter <- translate(left) + rightFilter <- translate(right) + } yield sources.Or(leftFilter, rightFilter) + + case expressions.Not(child) => + translate(child).map(sources.Not) + + case _ => None + } - case expressions.InSet(a: Attribute, set) => In(a.name, set.toArray) + filters.flatMap(translate) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala index 4a9fefc12b9ad..1e4505e36d2f0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -25,3 +25,8 @@ case class GreaterThanOrEqual(attribute: String, value: Any) 
extends Filter case class LessThan(attribute: String, value: Any) extends Filter case class LessThanOrEqual(attribute: String, value: Any) extends Filter case class In(attribute: String, values: Array[Any]) extends Filter +case class IsNull(attribute: String) extends Filter +case class IsNotNull(attribute: String) extends Filter +case class And(left: Filter, right: Filter) extends Filter +case class Or(left: Filter, right: Filter) extends Filter +case class Not(child: Filter) extends Filter diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala index 390538d35a348..41cd35683c196 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/FilteredScanSuite.scala @@ -47,16 +47,22 @@ case class SimpleFilteredScan(from: Int, to: Int)(@transient val sqlContext: SQL FiltersPushed.list = filters - val filterFunctions = filters.collect { + def translateFilter(filter: Filter): Int => Boolean = filter match { case EqualTo("a", v) => (a: Int) => a == v case LessThan("a", v: Int) => (a: Int) => a < v case LessThanOrEqual("a", v: Int) => (a: Int) => a <= v case GreaterThan("a", v: Int) => (a: Int) => a > v case GreaterThanOrEqual("a", v: Int) => (a: Int) => a >= v case In("a", values) => (a: Int) => values.map(_.asInstanceOf[Int]).toSet.contains(a) + case IsNull("a") => (a: Int) => false // Int can't be null + case IsNotNull("a") => (a: Int) => true + case Not(pred) => (a: Int) => !translateFilter(pred)(a) + case And(left, right) => (a: Int) => translateFilter(left)(a) && translateFilter(right)(a) + case Or(left, right) => (a: Int) => translateFilter(left)(a) || translateFilter(right)(a) + case _ => (a: Int) => true } - def eval(a: Int) = !filterFunctions.map(_(a)).contains(false) + def eval(a: Int) = !filters.map(translateFilter(_)(a)).contains(false) sqlContext.sparkContext.parallelize(from to to).filter(eval).map(i => Row.fromSeq(rowBuilders.map(_(i)).reduceOption(_ ++ _).getOrElse(Seq.empty))) @@ -136,6 +142,26 @@ class FilteredScanSuite extends DataSourceTest { "SELECT * FROM oneToTenFiltered WHERE b = 2", Seq(1).map(i => Row(i, i * 2)).toSeq) + sqlTest( + "SELECT * FROM oneToTenFiltered WHERE a IS NULL", + Seq.empty[Row]) + + sqlTest( + "SELECT * FROM oneToTenFiltered WHERE a IS NOT NULL", + (1 to 10).map(i => Row(i, i * 2)).toSeq) + + sqlTest( + "SELECT * FROM oneToTenFiltered WHERE a < 5 AND a > 1", + (2 to 4).map(i => Row(i, i * 2)).toSeq) + + sqlTest( + "SELECT * FROM oneToTenFiltered WHERE a < 3 OR a > 8", + Seq(1, 2, 9, 10).map(i => Row(i, i * 2)).toSeq) + + sqlTest( + "SELECT * FROM oneToTenFiltered WHERE NOT (a < 6)", + (6 to 10).map(i => Row(i, i * 2)).toSeq) + testPushDown("SELECT * FROM oneToTenFiltered WHERE A = 1", 1) testPushDown("SELECT a FROM oneToTenFiltered WHERE A = 1", 1) testPushDown("SELECT b FROM oneToTenFiltered WHERE A = 1", 1) @@ -162,6 +188,10 @@ class FilteredScanSuite extends DataSourceTest { testPushDown("SELECT * FROM oneToTenFiltered WHERE a = 20", 0) testPushDown("SELECT * FROM oneToTenFiltered WHERE b = 1", 10) + testPushDown("SELECT * FROM oneToTenFiltered WHERE a < 5 AND a > 1", 3) + testPushDown("SELECT * FROM oneToTenFiltered WHERE a < 3 OR a > 8", 4) + testPushDown("SELECT * FROM oneToTenFiltered WHERE NOT (a < 6)", 5) + def testPushDown(sqlString: String, expectedCount: Int): Unit = { test(s"PushDown Returns $expectedCount: $sqlString") { val queryExecution 
= sql(sqlString).queryExecution From c51ab37faddf4ede23243058dfb388e74a192552 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 16 Feb 2015 12:52:05 -0800 Subject: [PATCH 191/357] [SPARK-5833] [SQL] Adds REFRESH TABLE command Lifts `HiveMetastoreCatalog.refreshTable` to `Catalog`. Adds `RefreshTable` command to refresh (possibly cached) metadata in external data sources tables. [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4624) Author: Cheng Lian Closes #4624 from liancheng/refresh-table and squashes the following commits: 8d1aa4c [Cheng Lian] Adds REFRESH TABLE command --- .../spark/sql/catalyst/analysis/Catalog.scala | 10 ++++ .../org/apache/spark/sql/sources/ddl.scala | 52 +++++++++++-------- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../sql/hive/MetastoreDataSourcesSuite.scala | 2 +- 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala index bf97215ee67da..9e6e2912e0622 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala @@ -46,6 +46,8 @@ trait Catalog { */ def getTables(databaseName: Option[String]): Seq[(String, Boolean)] + def refreshTable(databaseName: String, tableName: String): Unit + def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit def unregisterTable(tableIdentifier: Seq[String]): Unit @@ -119,6 +121,10 @@ class SimpleCatalog(val caseSensitive: Boolean) extends Catalog { case (name, _) => (name, true) }.toSeq } + + override def refreshTable(databaseName: String, tableName: String): Unit = { + throw new UnsupportedOperationException + } } /** @@ -224,4 +230,8 @@ object EmptyCatalog extends Catalog { } override def unregisterAllTables(): Unit = {} + + override def refreshTable(databaseName: String, tableName: String): Unit = { + throw new UnsupportedOperationException + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index 8cac9c0fdf7fa..1b5e8c280e2fe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.catalyst.expressions.{Row, AttributeReference} import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -66,6 +66,7 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { protected val EXTENDED = Keyword("EXTENDED") protected val AS = Keyword("AS") protected val COMMENT = Keyword("COMMENT") + protected val REFRESH = Keyword("REFRESH") // Data types. 
protected val STRING = Keyword("STRING") @@ -85,7 +86,7 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { protected val MAP = Keyword("MAP") protected val STRUCT = Keyword("STRUCT") - protected lazy val ddl: Parser[LogicalPlan] = createTable | describeTable + protected lazy val ddl: Parser[LogicalPlan] = createTable | describeTable | refreshTable protected def start: Parser[LogicalPlan] = ddl @@ -104,9 +105,8 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { * AS SELECT ... */ protected lazy val createTable: Parser[LogicalPlan] = - ( - (CREATE ~> TEMPORARY.? <~ TABLE) ~ (IF ~> NOT <~ EXISTS).? ~ ident - ~ (tableCols).? ~ (USING ~> className) ~ (OPTIONS ~> options).? ~ (AS ~> restInput).? ^^ { + (CREATE ~> TEMPORARY.? <~ TABLE) ~ (IF ~> NOT <~ EXISTS).? ~ ident ~ + tableCols.? ~ (USING ~> className) ~ (OPTIONS ~> options).? ~ (AS ~> restInput).? ^^ { case temp ~ allowExisting ~ tableName ~ columns ~ provider ~ opts ~ query => if (temp.isDefined && allowExisting.isDefined) { throw new DDLException( @@ -145,8 +145,7 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { allowExisting.isDefined, managedIfNoPath = false) } - } - ) + } protected lazy val tableCols: Parser[Seq[StructField]] = "(" ~> repsep(column, ",") <~ ")" @@ -166,6 +165,12 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { DescribeCommand(UnresolvedRelation(tblIdentifier, None), e.isDefined) } + protected lazy val refreshTable: Parser[LogicalPlan] = + REFRESH ~> TABLE ~> (ident <~ ".").? ~ ident ^^ { + case maybeDatabaseName ~ tableName => + RefreshTable(maybeDatabaseName.getOrElse("default"), tableName) + } + protected lazy val options: Parser[Map[String, String]] = "(" ~> repsep(pair, ",") <~ ")" ^^ { case s: Seq[(String, String)] => s.toMap } @@ -177,10 +182,10 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { ident ~ dataType ~ (COMMENT ~> stringLit).? ^^ { case columnName ~ typ ~ cm => val meta = cm match { case Some(comment) => - new MetadataBuilder().putString(COMMENT.str.toLowerCase(), comment).build() + new MetadataBuilder().putString(COMMENT.str.toLowerCase, comment).build() case None => Metadata.empty } - StructField(columnName, typ, true, meta) + StructField(columnName, typ, nullable = true, meta) } protected lazy val primitiveType: Parser[DataType] = @@ -318,24 +323,18 @@ private[sql] case class DescribeCommand( isExtended: Boolean) extends Command { override val output = Seq( // Column names are based on Hive. - AttributeReference("col_name", StringType, nullable = false, + AttributeReference("col_name", StringType, nullable = false, new MetadataBuilder().putString("comment", "name of the column").build())(), - AttributeReference("data_type", StringType, nullable = false, + AttributeReference("data_type", StringType, nullable = false, new MetadataBuilder().putString("comment", "data type of the column").build())(), - AttributeReference("comment", StringType, nullable = false, + AttributeReference("comment", StringType, nullable = false, new MetadataBuilder().putString("comment", "comment of the column").build())()) } /** * Used to represent the operation of create table using a data source. - * @param tableName - * @param userSpecifiedSchema - * @param provider - * @param temporary - * @param options * @param allowExisting If it is true, we will do nothing when the table already exists. 
- * If it is false, an exception will be thrown - * @param managedIfNoPath + * If it is false, an exception will be thrown */ private[sql] case class CreateTableUsing( tableName: String, @@ -362,7 +361,7 @@ private[sql] case class CreateTableUsingAsLogicalPlan( options: Map[String, String], query: LogicalPlan) extends Command -private [sql] case class CreateTempTableUsing( +private[sql] case class CreateTempTableUsing( tableName: String, userSpecifiedSchema: Option[StructType], provider: String, @@ -376,7 +375,7 @@ private [sql] case class CreateTempTableUsing( } } -private [sql] case class CreateTempTableUsingAsSelect( +private[sql] case class CreateTempTableUsingAsSelect( tableName: String, provider: String, mode: SaveMode, @@ -393,6 +392,15 @@ private [sql] case class CreateTempTableUsingAsSelect( } } +private[sql] case class RefreshTable(databaseName: String, tableName: String) + extends RunnableCommand { + + override def run(sqlContext: SQLContext): Seq[Row] = { + sqlContext.catalog.refreshTable(databaseName, tableName) + Seq.empty[Row] + } +} + /** * Builds a map in which keys are case insensitive */ @@ -408,7 +416,7 @@ protected[sql] class CaseInsensitiveMap(map: Map[String, String]) extends Map[St override def iterator: Iterator[(String, String)] = baseMap.iterator - override def -(key: String): Map[String, String] = baseMap - key.toLowerCase() + override def -(key: String): Map[String, String] = baseMap - key.toLowerCase } /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 12f86a04a37af..580c5706dde67 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -91,7 +91,7 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with CacheBuilder.newBuilder().maximumSize(1000).build(cacheLoader) } - def refreshTable(databaseName: String, tableName: String): Unit = { + override def refreshTable(databaseName: String, tableName: String): Unit = { cachedDataSourceTables.refresh(QualifiedTableName(databaseName, tableName).toLowerCase) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index addf887ab9162..375aae5d51915 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -177,7 +177,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { sql("SELECT * FROM jsonTable"), Row("a1", "b1")) - refreshTable("jsonTable") + sql("REFRESH TABLE jsonTable") // Check that the refresh worked checkAnswer( From bb05982dd25e008fb01684dff1f95d03e7271721 Mon Sep 17 00:00:00 2001 From: Matt Whelan Date: Mon, 16 Feb 2015 22:54:32 +0000 Subject: [PATCH 192/357] SPARK-5841: remove DiskBlockManager shutdown hook on stop After a call to stop, the shutdown hook is redundant, and causes a memory leak. 
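As a minimal sketch of the pattern (the class and method names here are illustrative, not the actual DiskBlockManager code): keep a handle on the registered hook so that stop() can unregister it; otherwise the JVM keeps a strong reference to the hook, and through it to the already-stopped object, until shutdown.

```scala
class LocalDirCleaner {
  // Keep the hook so it can be removed once stop() has already done the cleanup.
  private val shutdownHook = new Thread("delete local dirs") {
    override def run(): Unit = doCleanup()
  }
  Runtime.getRuntime.addShutdownHook(shutdownHook)

  private def doCleanup(): Unit = {
    // delete temporary directories here
  }

  def stop(): Unit = {
    // After an explicit stop the hook is redundant; removing it avoids leaking this object.
    Runtime.getRuntime.removeShutdownHook(shutdownHook)
    doCleanup()
  }
}
```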
Author: Matt Whelan Closes #4627 from MattWhelan/SPARK-5841 and squashes the following commits: d5f5c7f [Matt Whelan] SPARK-5841: remove DiskBlockManager shutdown hook on stop --- .../org/apache/spark/storage/DiskBlockManager.scala | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index 53eaedacbf291..ae9df8cbe9821 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -49,7 +49,7 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon } private val subDirs = Array.fill(localDirs.length)(new Array[File](subDirsPerLocalDir)) - addShutdownHook() + private val shutdownHook = addShutdownHook() /** Looks up a file by hashing it into one of our local subdirectories. */ // This method should be kept in sync with @@ -134,17 +134,22 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon } } - private def addShutdownHook() { - Runtime.getRuntime.addShutdownHook(new Thread("delete Spark local dirs") { + private def addShutdownHook(): Thread = { + val shutdownHook = new Thread("delete Spark local dirs") { override def run(): Unit = Utils.logUncaughtExceptions { logDebug("Shutdown hook called") DiskBlockManager.this.stop() } - }) + } + Runtime.getRuntime.addShutdownHook(shutdownHook) + shutdownHook } /** Cleanup local dirs and stop shuffle sender. */ private[spark] def stop() { + // Remove the shutdown hook. It causes memory leaks if we leave it around. + Runtime.getRuntime.removeShutdownHook(shutdownHook) + // Only perform cleanup if an external service is not serving our shuffle files. if (!blockManager.externalShuffleServiceEnabled || blockManager.blockManagerId.isDriver) { localDirs.foreach { localDir => From c01c4ebcfe5c1a4a56a8987af596eca090c2cc2f Mon Sep 17 00:00:00 2001 From: Matt Whelan Date: Mon, 16 Feb 2015 23:05:34 +0000 Subject: [PATCH 193/357] SPARK-5357: Update commons-codec version to 1.10 (current) Resolves https://issues.apache.org/jira/browse/SPARK-5357 In commons-codec 1.5, Base64 instances are not thread safe. That was only true from 1.4-1.6. Author: Matt Whelan Closes #4153 from MattWhelan/depsUpdate and squashes the following commits: b4a91f4 [Matt Whelan] SPARK-5357: Update commons-codec version to 1.10 (current) --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6810d71be4230..bb355bf735bee 100644 --- a/pom.xml +++ b/pom.xml @@ -404,7 +404,7 @@ commons-codec commons-codec - 1.5 + 1.10 org.apache.commons From 0cfda8461f173428f955aa9a7140b1356beea400 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 16 Feb 2015 15:25:11 -0800 Subject: [PATCH 194/357] [SPARK-2313] Use socket to communicate GatewayServer port back to Python driver This patch changes PySpark so that the GatewayServer's port is communicated back to the Python process that launches it over a local socket instead of a pipe. The old pipe-based approach was brittle and could fail if `spark-submit` printed unexpected to stdout. To accomplish this, I wrote a custom `PythonGatewayServer.main()` function to use in place of Py4J's `GatewayServer.main()`. Closes #3424. 
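To make the handshake concrete, here is a small self-contained sketch (the names and the stand-in port are illustrative, not the code in this patch): the parent listens on an ephemeral callback socket, the child connects back and writes the port it actually bound, so nothing depends on parsing the child's stdout.

```scala
import java.io.{DataInputStream, DataOutputStream}
import java.net.{ServerSocket, Socket}

object CallbackHandshakeSketch {
  def main(args: Array[String]): Unit = {
    val callback = new ServerSocket(0)  // parent: bind an ephemeral callback port

    val child = new Thread {
      override def run(): Unit = {
        val boundPort = 25333  // stand-in for GatewayServer.getListeningPort
        val sock = new Socket("127.0.0.1", callback.getLocalPort)
        new DataOutputStream(sock.getOutputStream).writeInt(boundPort)
        sock.close()
      }
    }
    child.start()

    val conn = callback.accept()  // parent: block until the child reports back
    val gatewayPort = new DataInputStream(conn.getInputStream).readInt()
    println(s"gateway bound to port $gatewayPort")
    conn.close()
    callback.close()
  }
}
```

In the real change the parent is the Python driver, the callback address is handed to the JVM through the `_PYSPARK_DRIVER_CALLBACK_HOST` and `_PYSPARK_DRIVER_CALLBACK_PORT` environment variables, and `select()` guards against the child dying before it ever connects.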
Author: Josh Rosen Closes #4603 from JoshRosen/SPARK-2313 and squashes the following commits: 6a7740b [Josh Rosen] Remove EchoOutputThread since it's no longer needed 0db501f [Josh Rosen] Use select() so that we don't block if GatewayServer dies. 9bdb4b6 [Josh Rosen] Handle case where getListeningPort returns -1 3fb7ed1 [Josh Rosen] Remove stdout=PIPE 2458934 [Josh Rosen] Use underscore to mark env var. as private d12c95d [Josh Rosen] Use Logging and Utils.tryOrExit() e5f9730 [Josh Rosen] Wrap everything in a giant try-block 2f70689 [Josh Rosen] Use stdin PIPE to share fate with driver 8bf956e [Josh Rosen] Initial cut at passing Py4J gateway port back to driver via socket --- .../api/python/PythonGatewayServer.scala | 64 +++++++++++++++++ .../org/apache/spark/deploy/SparkSubmit.scala | 4 +- python/pyspark/java_gateway.py | 72 +++++++++---------- 3 files changed, 97 insertions(+), 43 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala b/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala new file mode 100644 index 0000000000000..164e95081583f --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.python + +import java.io.DataOutputStream +import java.net.Socket + +import py4j.GatewayServer + +import org.apache.spark.Logging +import org.apache.spark.util.Utils + +/** + * Process that starts a Py4J GatewayServer on an ephemeral port and communicates the bound port + * back to its caller via a callback port specified by the caller. + * + * This process is launched (via SparkSubmit) by the PySpark driver (see java_gateway.py). 
+ */ +private[spark] object PythonGatewayServer extends Logging { + def main(args: Array[String]): Unit = Utils.tryOrExit { + // Start a GatewayServer on an ephemeral port + val gatewayServer: GatewayServer = new GatewayServer(null, 0) + gatewayServer.start() + val boundPort: Int = gatewayServer.getListeningPort + if (boundPort == -1) { + logError("GatewayServer failed to bind; exiting") + System.exit(1) + } else { + logDebug(s"Started PythonGatewayServer on port $boundPort") + } + + // Communicate the bound port back to the caller via the caller-specified callback port + val callbackHost = sys.env("_PYSPARK_DRIVER_CALLBACK_HOST") + val callbackPort = sys.env("_PYSPARK_DRIVER_CALLBACK_PORT").toInt + logDebug(s"Communicating GatewayServer port to Python driver at $callbackHost:$callbackPort") + val callbackSocket = new Socket(callbackHost, callbackPort) + val dos = new DataOutputStream(callbackSocket.getOutputStream) + dos.writeInt(boundPort) + dos.close() + callbackSocket.close() + + // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies: + while (System.in.read() != -1) { + // Do nothing + } + logDebug("Exiting due to broken pipe from Python driver") + System.exit(0) + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 54399e99c98f0..012a89a31b046 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -39,7 +39,6 @@ import org.apache.ivy.plugins.resolver.{ChainResolver, IBiblioResolver} import org.apache.spark.SPARK_VERSION import org.apache.spark.deploy.rest._ -import org.apache.spark.executor._ import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils} /** @@ -284,8 +283,7 @@ object SparkSubmit { // If we're running a python app, set the main class to our specific python runner if (args.isPython && deployMode == CLIENT) { if (args.primaryResource == PYSPARK_SHELL) { - args.mainClass = "py4j.GatewayServer" - args.childArgs = ArrayBuffer("--die-on-broken-pipe", "0") + args.mainClass = "org.apache.spark.api.python.PythonGatewayServer" } else { // If a python file is provided, add it to the child arguments and list of files to deploy. // Usage: PythonAppRunner
    [app arguments] diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index a0a028446d5fd..936857e75c7e9 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -17,19 +17,20 @@ import atexit import os -import sys +import select import signal import shlex +import socket import platform from subprocess import Popen, PIPE -from threading import Thread from py4j.java_gateway import java_import, JavaGateway, GatewayClient +from pyspark.serializers import read_int + def launch_gateway(): SPARK_HOME = os.environ["SPARK_HOME"] - gateway_port = -1 if "PYSPARK_GATEWAY_PORT" in os.environ: gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"]) else: @@ -41,36 +42,42 @@ def launch_gateway(): submit_args = submit_args if submit_args is not None else "" submit_args = shlex.split(submit_args) command = [os.path.join(SPARK_HOME, script)] + submit_args + ["pyspark-shell"] + + # Start a socket that will be used by PythonGatewayServer to communicate its port to us + callback_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + callback_socket.bind(('127.0.0.1', 0)) + callback_socket.listen(1) + callback_host, callback_port = callback_socket.getsockname() + env = dict(os.environ) + env['_PYSPARK_DRIVER_CALLBACK_HOST'] = callback_host + env['_PYSPARK_DRIVER_CALLBACK_PORT'] = str(callback_port) + + # Launch the Java gateway. + # We open a pipe to stdin so that the Java gateway can die when the pipe is broken if not on_windows: # Don't send ctrl-c / SIGINT to the Java gateway: def preexec_func(): signal.signal(signal.SIGINT, signal.SIG_IGN) - env = dict(os.environ) env["IS_SUBPROCESS"] = "1" # tell JVM to exit after python exits - proc = Popen(command, stdout=PIPE, stdin=PIPE, preexec_fn=preexec_func, env=env) + proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env) else: # preexec_fn not supported on Windows - proc = Popen(command, stdout=PIPE, stdin=PIPE) + proc = Popen(command, stdin=PIPE, env=env) - try: - # Determine which ephemeral port the server started on: - gateway_port = proc.stdout.readline() - gateway_port = int(gateway_port) - except ValueError: - # Grab the remaining lines of stdout - (stdout, _) = proc.communicate() - exit_code = proc.poll() - error_msg = "Launching GatewayServer failed" - error_msg += " with exit code %d!\n" % exit_code if exit_code else "!\n" - error_msg += "Warning: Expected GatewayServer to output a port, but found " - if gateway_port == "" and stdout == "": - error_msg += "no output.\n" - else: - error_msg += "the following:\n\n" - error_msg += "--------------------------------------------------------------\n" - error_msg += gateway_port + stdout - error_msg += "--------------------------------------------------------------\n" - raise Exception(error_msg) + gateway_port = None + # We use select() here in order to avoid blocking indefinitely if the subprocess dies + # before connecting + while gateway_port is None and proc.poll() is None: + timeout = 1 # (seconds) + readable, _, _ = select.select([callback_socket], [], [], timeout) + if callback_socket in readable: + gateway_connection = callback_socket.accept()[0] + # Determine which ephemeral port the server started on: + gateway_port = read_int(gateway_connection.makefile()) + gateway_connection.close() + callback_socket.close() + if gateway_port is None: + raise Exception("Java gateway process exited before sending the driver its port number") # In Windows, ensure the Java child processes do not linger after Python has exited. 
# In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when @@ -88,21 +95,6 @@ def killChild(): Popen(["cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid)]) atexit.register(killChild) - # Create a thread to echo output from the GatewayServer, which is required - # for Java log output to show up: - class EchoOutputThread(Thread): - - def __init__(self, stream): - Thread.__init__(self) - self.daemon = True - self.stream = stream - - def run(self): - while True: - line = self.stream.readline() - sys.stderr.write(line) - EchoOutputThread(proc.stdout).start() - # Connect to the gateway gateway = JavaGateway(GatewayClient(port=gateway_port), auto_convert=False) From 04b401da811e62a4365cf39ea95cadd0e737001c Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 16 Feb 2015 15:43:56 -0800 Subject: [PATCH 195/357] HOTFIX: Break in Jekyll build from #4589 That patch had a line break in the middle of a {{ }} expression, which is not allowed. --- docs/streaming-programming-guide.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 997de9511ca3e..815c98713b738 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -908,8 +908,7 @@ JavaPairDStream runningCounts = pairs.updateStateByKey(updateFu The update function will be called for each word, with `newValues` having a sequence of 1's (from the `(word, 1)` pairs) and the `runningCount` having the previous count. For the complete Java code, take a look at the example -[JavaStatefulNetworkWordCount.java]({{site -.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/org/apache/spark/examples/streaming +[JavaStatefulNetworkWordCount.java]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/org/apache/spark/examples/streaming /JavaStatefulNetworkWordCount.java).
  • From 5b6cd65cd611b1a46a7d5eb33139c6224b96264e Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 16 Feb 2015 15:51:59 -0800 Subject: [PATCH 196/357] [SPARK-5746][SQL] Check invalid cases for the write path of data source API JIRA: https://issues.apache.org/jira/browse/SPARK-5746 liancheng marmbrus Author: Yin Huai Closes #4617 from yhuai/insertOverwrite and squashes the following commits: 8e3019d [Yin Huai] Fix compilation error. 499e8e7 [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertOverwrite e76e85a [Yin Huai] Address comments. ac31b3c [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertOverwrite f30bdad [Yin Huai] Use toDF. 99da57e [Yin Huai] Merge remote-tracking branch 'upstream/master' into insertOverwrite 6b7545c [Yin Huai] Add a pre write check to the data source API. a88c516 [Yin Huai] DDLParser will take a parsering function to take care CTAS statements. --- .../sql/catalyst/analysis/Analyzer.scala | 13 +++- .../org/apache/spark/sql/DataFrameImpl.scala | 8 ++- .../org/apache/spark/sql/SQLContext.scala | 5 +- .../spark/sql/execution/SparkStrategies.scala | 10 +-- .../sql/sources/DataSourceStrategy.scala | 5 +- .../org/apache/spark/sql/sources/ddl.scala | 29 ++++---- .../org/apache/spark/sql/sources/rules.scala | 72 +++++++++++++++++-- .../spark/sql/parquet/ParquetQuerySuite.scala | 13 ++-- .../sources/CreateTableAsSelectSuite.scala | 28 ++++++++ .../spark/sql/sources/DataSourceTest.scala | 3 +- ...nsertIntoSuite.scala => InsertSuite.scala} | 46 +++++++++++- .../apache/spark/sql/hive/HiveContext.scala | 12 ++-- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../spark/sql/hive/HiveStrategies.scala | 8 +-- 14 files changed, 197 insertions(+), 57 deletions(-) rename sql/core/src/test/scala/org/apache/spark/sql/sources/{InsertIntoSuite.scala => InsertSuite.scala} (79%) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index aa4320bd582cb..fc37b8cde0806 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -50,7 +50,13 @@ class Analyzer(catalog: Catalog, /** * Override to provide additional rules for the "Resolution" batch. */ - val extendedRules: Seq[Rule[LogicalPlan]] = Nil + val extendedResolutionRules: Seq[Rule[LogicalPlan]] = Nil + + /** + * Override to provide additional rules for the "Check Analysis" batch. + * These rules will be evaluated after our built-in check rules. 
+ */ + val extendedCheckRules: Seq[Rule[LogicalPlan]] = Nil lazy val batches: Seq[Batch] = Seq( Batch("Resolution", fixedPoint, @@ -64,9 +70,10 @@ class Analyzer(catalog: Catalog, UnresolvedHavingClauseAttributes :: TrimGroupingAliases :: typeCoercionRules ++ - extendedRules : _*), + extendedResolutionRules : _*), Batch("Check Analysis", Once, - CheckResolution), + CheckResolution +: + extendedCheckRules: _*), Batch("Remove SubQueries", fixedPoint, EliminateSubQueries) ) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 500e3c90fdbc1..3c1cf8d5e3851 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -67,7 +67,11 @@ private[sql] class DataFrameImpl protected[sql]( @transient protected[sql] override val logicalPlan: LogicalPlan = queryExecution.logical match { // For various commands (like DDL) and queries with side effects, we force query optimization to // happen right away to let these side effects take place eagerly. - case _: Command | _: InsertIntoTable | _: CreateTableAsSelect[_] |_: WriteToFile => + case _: Command | + _: InsertIntoTable | + _: CreateTableAsSelect[_] | + _: CreateTableUsingAsSelect | + _: WriteToFile => LogicalRDD(queryExecution.analyzed.output, queryExecution.toRdd)(sqlContext) case _ => queryExecution.logical @@ -386,7 +390,7 @@ private[sql] class DataFrameImpl protected[sql]( mode: SaveMode, options: Map[String, String]): Unit = { val cmd = - CreateTableUsingAsLogicalPlan( + CreateTableUsingAsSelect( tableName, source, temporary = false, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 1442250569416..d08c2d1cfe030 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -92,7 +92,8 @@ class SQLContext(@transient val sparkContext: SparkContext) @transient protected[sql] lazy val analyzer: Analyzer = new Analyzer(catalog, functionRegistry, caseSensitive = true) { - override val extendedRules = + override val extendedResolutionRules = + sources.PreWriteCheck(catalog) :: sources.PreInsertCastAndRename :: Nil } @@ -101,7 +102,7 @@ class SQLContext(@transient val sparkContext: SparkContext) protected[sql] lazy val optimizer: Optimizer = DefaultOptimizer @transient - protected[sql] val ddlParser = new DDLParser + protected[sql] val ddlParser = new DDLParser(sqlParser.apply(_)) @transient protected[sql] val sqlParser = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index e915e0e6a0ec1..5281c7502556a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -319,18 +319,10 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { sys.error("allowExisting should be set to false when creating a temporary table.") case CreateTableUsingAsSelect(tableName, provider, true, mode, opts, query) => - val logicalPlan = sqlContext.parseSql(query) - val cmd = - CreateTempTableUsingAsSelect(tableName, provider, mode, opts, logicalPlan) - ExecutedCommand(cmd) :: Nil - case c: CreateTableUsingAsSelect if !c.temporary => - sys.error("Tables created 
with SQLContext must be TEMPORARY. Use a HiveContext instead.") - - case CreateTableUsingAsLogicalPlan(tableName, provider, true, mode, opts, query) => val cmd = CreateTempTableUsingAsSelect(tableName, provider, mode, opts, query) ExecutedCommand(cmd) :: Nil - case c: CreateTableUsingAsLogicalPlan if !c.temporary => + case c: CreateTableUsingAsSelect if !c.temporary => sys.error("Tables created with SQLContext must be TEMPORARY. Use a HiveContext instead.") case LogicalDescribeCommand(table, isExtended) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala index a853385fdac68..67f3507c61ab6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala @@ -55,10 +55,7 @@ private[sql] object DataSourceStrategy extends Strategy { execution.PhysicalRDD(l.output, t.buildScan()) :: Nil case i @ logical.InsertIntoTable( - l @ LogicalRelation(t: InsertableRelation), partition, query, overwrite) => - if (partition.nonEmpty) { - sys.error(s"Insert into a partition is not allowed because $l is not partitioned.") - } + l @ LogicalRelation(t: InsertableRelation), part, query, overwrite) if part.isEmpty => execution.ExecutedCommand(InsertIntoDataSource(l, query, overwrite)) :: Nil case _ => Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index 1b5e8c280e2fe..dd8b3d211be64 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.expressions.{Row, AttributeReference} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Row} import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -32,7 +32,8 @@ import org.apache.spark.util.Utils /** * A parser for foreign DDL commands. */ -private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { +private[sql] class DDLParser( + parseQuery: String => LogicalPlan) extends AbstractSparkSQLParser with Logging { def apply(input: String, exceptionOnError: Boolean): Option[LogicalPlan] = { try { @@ -105,6 +106,7 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { * AS SELECT ... */ protected lazy val createTable: Parser[LogicalPlan] = + // TODO: Support database.table. (CREATE ~> TEMPORARY.? <~ TABLE) ~ (IF ~> NOT <~ EXISTS).? ~ ident ~ tableCols.? ~ (USING ~> className) ~ (OPTIONS ~> options).? ~ (AS ~> restInput).? 
^^ { case temp ~ allowExisting ~ tableName ~ columns ~ provider ~ opts ~ query => @@ -128,12 +130,13 @@ private[sql] class DDLParser extends AbstractSparkSQLParser with Logging { SaveMode.ErrorIfExists } + val queryPlan = parseQuery(query.get) CreateTableUsingAsSelect(tableName, provider, temp.isDefined, mode, options, - query.get) + queryPlan) } else { val userSpecifiedSchema = columns.flatMap(fields => Some(StructType(fields))) CreateTableUsing( @@ -345,21 +348,23 @@ private[sql] case class CreateTableUsing( allowExisting: Boolean, managedIfNoPath: Boolean) extends Command +/** + * A node used to support CTAS statements and saveAsTable for the data source API. + * This node is a [[UnaryNode]] instead of a [[Command]] because we want the analyzer + * can analyze the logical plan that will be used to populate the table. + * So, [[PreWriteCheck]] can detect cases that are not allowed. + */ private[sql] case class CreateTableUsingAsSelect( tableName: String, provider: String, temporary: Boolean, mode: SaveMode, options: Map[String, String], - query: String) extends Command - -private[sql] case class CreateTableUsingAsLogicalPlan( - tableName: String, - provider: String, - temporary: Boolean, - mode: SaveMode, - options: Map[String, String], - query: LogicalPlan) extends Command + child: LogicalPlan) extends UnaryNode { + override def output = Seq.empty[Attribute] + // TODO: Override resolved after we support databaseName. + // override lazy val resolved = databaseName != None && childrenResolved +} private[sql] case class CreateTempTableUsing( tableName: String, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala index 4ed22d363da5b..36a9c0bdc41e6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala @@ -17,7 +17,10 @@ package org.apache.spark.sql.sources +import org.apache.spark.sql.{SaveMode, AnalysisException} +import org.apache.spark.sql.catalyst.analysis.{EliminateSubQueries, Catalog} import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Alias} +import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.types.DataType @@ -26,11 +29,9 @@ import org.apache.spark.sql.types.DataType * A rule to do pre-insert data type casting and field renaming. Before we insert into * an [[InsertableRelation]], we will use this rule to make sure that * the columns to be inserted have the correct data type and fields have the correct names. - * @param resolver The resolver used by the Analyzer. */ private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = { - plan.transform { + def apply(plan: LogicalPlan): LogicalPlan = plan transform { // Wait until children are resolved. case p: LogicalPlan if !p.childrenResolved => p @@ -46,7 +47,6 @@ private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] { } castAndRenameChildOutput(i, l.output, child) } - } } /** If necessary, cast data types and rename fields to the expected types and names. */ @@ -74,3 +74,67 @@ private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] { } } } + +/** + * A rule to do various checks before inserting into or writing to a data source table. 
+ */ +private[sql] case class PreWriteCheck(catalog: Catalog) extends Rule[LogicalPlan] { + def failAnalysis(msg: String) = { throw new AnalysisException(msg) } + + def apply(plan: LogicalPlan): LogicalPlan = { + plan.foreach { + case i @ logical.InsertIntoTable( + l @ LogicalRelation(t: InsertableRelation), partition, query, overwrite) => + // Right now, we do not support insert into a data source table with partition specs. + if (partition.nonEmpty) { + failAnalysis(s"Insert into a partition is not allowed because $l is not partitioned.") + } else { + // Get all input data source relations of the query. + val srcRelations = query.collect { + case LogicalRelation(src: BaseRelation) => src + } + if (srcRelations.exists(src => src == t)) { + failAnalysis( + "Cannot insert overwrite into table that is also being read from.") + } else { + // OK + } + } + + case i @ logical.InsertIntoTable( + l: LogicalRelation, partition, query, overwrite) if !l.isInstanceOf[InsertableRelation] => + // The relation in l is not an InsertableRelation. + failAnalysis(s"$l does not allow insertion.") + + case CreateTableUsingAsSelect(tableName, _, _, SaveMode.Overwrite, _, query) => + // When the SaveMode is Overwrite, we need to check if the table is an input table of + // the query. If so, we will throw an AnalysisException to let users know it is not allowed. + if (catalog.tableExists(Seq(tableName))) { + // Need to remove SubQuery operator. + EliminateSubQueries(catalog.lookupRelation(Seq(tableName))) match { + // Only do the check if the table is a data source table + // (the relation is a BaseRelation). + case l @ LogicalRelation(dest: BaseRelation) => + // Get all input data source relations of the query. + val srcRelations = query.collect { + case LogicalRelation(src: BaseRelation) => src + } + if (srcRelations.exists(src => src == dest)) { + failAnalysis( + s"Cannot overwrite table $tableName that is also being read from.") + } else { + // OK + } + + case _ => // OK + } + } else { + // OK + } + + case _ => // OK + } + + plan + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index d0665450cd766..9318c15520a10 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -38,21 +38,22 @@ class ParquetQuerySuiteBase extends QueryTest with ParquetTest { test("appending") { val data = (0 until 10).map(i => (i, i.toString)) + createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp") withParquetTable(data, "t") { - sql("INSERT INTO TABLE t SELECT * FROM t") + sql("INSERT INTO TABLE t SELECT * FROM tmp") checkAnswer(table("t"), (data ++ data).map(Row.fromTuple)) } + catalog.unregisterTable(Seq("tmp")) } - // This test case will trigger the NPE mentioned in - // https://issues.apache.org/jira/browse/PARQUET-151. - // Update: This also triggers SPARK-5746, should re enable it when we get both fixed. 
- ignore("overwriting") { + test("overwriting") { val data = (0 until 10).map(i => (i, i.toString)) + createDataFrame(data).toDF("c1", "c2").registerTempTable("tmp") withParquetTable(data, "t") { - sql("INSERT OVERWRITE TABLE t SELECT * FROM t") + sql("INSERT OVERWRITE TABLE t SELECT * FROM tmp") checkAnswer(table("t"), data.map(Row.fromTuple)) } + catalog.unregisterTable(Seq("tmp")) } test("self-join") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index 29caed9337ff6..60355414a40fa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.sources import java.io.File +import org.apache.spark.sql.AnalysisException import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.catalyst.util @@ -157,4 +158,31 @@ class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll { """.stripMargin) } } + + test("it is not allowed to write to a table while querying it.") { + sql( + s""" + |CREATE TEMPORARY TABLE jsonTable + |USING org.apache.spark.sql.json.DefaultSource + |OPTIONS ( + | path '${path.toString}' + |) AS + |SELECT a, b FROM jt + """.stripMargin) + + val message = intercept[AnalysisException] { + sql( + s""" + |CREATE TEMPORARY TABLE jsonTable + |USING org.apache.spark.sql.json.DefaultSource + |OPTIONS ( + | path '${path.toString}' + |) AS + |SELECT a, b FROM jsonTable + """.stripMargin) + }.getMessage + assert( + message.contains("Cannot overwrite table "), + "Writing to a table while querying it should not be allowed.") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala index 53f5f7426e9e6..0ec6881d7afe6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala @@ -29,7 +29,8 @@ abstract class DataSourceTest extends QueryTest with BeforeAndAfter { @transient override protected[sql] lazy val analyzer: Analyzer = new Analyzer(catalog, functionRegistry, caseSensitive = false) { - override val extendedRules = + override val extendedResolutionRules = + PreWriteCheck(catalog) :: PreInsertCastAndRename :: Nil } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertIntoSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala similarity index 79% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/InsertIntoSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 36e504e759152..5682e5a2bcea9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertIntoSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -21,11 +21,11 @@ import java.io.File import org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.Row +import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.catalyst.util import org.apache.spark.util.Utils -class InsertIntoSuite extends DataSourceTest with BeforeAndAfterAll { +class InsertSuite extends DataSourceTest with BeforeAndAfterAll { import caseInsensisitiveContext._ @@ -129,6 +129,18 @@ class InsertIntoSuite extends DataSourceTest with 
BeforeAndAfterAll { } } + test("it is not allowed to write to a table while querying it.") { + val message = intercept[AnalysisException] { + sql( + s""" + |INSERT OVERWRITE TABLE jsonTable SELECT a, b FROM jsonTable + """.stripMargin) + }.getMessage + assert( + message.contains("Cannot insert overwrite into table that is also being read from."), + "INSERT OVERWRITE to a table while querying it should not be allowed.") + } + test("Caching") { // Cached Query Execution cacheTable("jsonTable") @@ -173,4 +185,34 @@ class InsertIntoSuite extends DataSourceTest with BeforeAndAfterAll { uncacheTable("jsonTable") assertCached(sql("SELECT * FROM jsonTable"), 0) } + + test("it's not allowed to insert into a relation that is not an InsertableRelation") { + sql( + """ + |CREATE TEMPORARY TABLE oneToTen + |USING org.apache.spark.sql.sources.SimpleScanSource + |OPTIONS ( + | From '1', + | To '10' + |) + """.stripMargin) + + checkAnswer( + sql("SELECT * FROM oneToTen"), + (1 to 10).map(Row(_)).toSeq + ) + + val message = intercept[AnalysisException] { + sql( + s""" + |INSERT OVERWRITE TABLE oneToTen SELECT a FROM jt + """.stripMargin) + }.getMessage + assert( + message.contains("does not allow insertion."), + "It is not allowed to insert into a table that is not an InsertableRelation." + ) + + dropTempTable("oneToTen") + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 87b380f950979..6c55bc6be17f9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.catalyst.analysis.{Analyzer, EliminateSubQueries, Ov import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, QueryExecutionException, SetCommand} import org.apache.spark.sql.hive.execution.{DescribeHiveTableCommand, HiveNativeCommand} -import org.apache.spark.sql.sources.DataSourceStrategy +import org.apache.spark.sql.sources.{DDLParser, DataSourceStrategy} import org.apache.spark.sql.types._ /** @@ -64,14 +64,17 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { override protected[sql] def executePlan(plan: LogicalPlan): this.QueryExecution = new this.QueryExecution(plan) + @transient + protected[sql] val ddlParserWithHiveQL = new DDLParser(HiveQl.parseSql(_)) + override def sql(sqlText: String): DataFrame = { val substituted = new VariableSubstitution().substitute(hiveconf, sqlText) // TODO: Create a framework for registering parsers instead of just hardcoding if statements. if (conf.dialect == "sql") { super.sql(substituted) } else if (conf.dialect == "hiveql") { - DataFrame(this, - ddlParser(sqlText, exceptionOnError = false).getOrElse(HiveQl.parseSql(substituted))) + val ddlPlan = ddlParserWithHiveQL(sqlText, exceptionOnError = false) + DataFrame(this, ddlPlan.getOrElse(HiveQl.parseSql(substituted))) } else { sys.error(s"Unsupported SQL dialect: ${conf.dialect}. 
Try 'sql' or 'hiveql'") } @@ -241,12 +244,13 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { @transient override protected[sql] lazy val analyzer = new Analyzer(catalog, functionRegistry, caseSensitive = false) { - override val extendedRules = + override val extendedResolutionRules = catalog.ParquetConversions :: catalog.CreateTables :: catalog.PreInsertionCasts :: ExtractPythonUdfs :: ResolveUdtfsAlias :: + sources.PreWriteCheck(catalog) :: sources.PreInsertCastAndRename :: Nil } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 580c5706dde67..72211fe2e46c6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -663,7 +663,7 @@ private[hive] case class MetastoreRelation } object HiveMetastoreTypes { - protected val ddlParser = new DDLParser + protected val ddlParser = new DDLParser(HiveQl.parseSql(_)) def toDataType(metastoreType: String): DataType = synchronized { ddlParser.parseType(metastoreType) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 965d159656d80..d2c39ab621713 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeComman import org.apache.spark.sql.execution._ import org.apache.spark.sql.hive.execution._ import org.apache.spark.sql.parquet.ParquetRelation -import org.apache.spark.sql.sources.{CreateTableUsingAsLogicalPlan, CreateTableUsingAsSelect, CreateTableUsing} +import org.apache.spark.sql.sources.{CreateTableUsingAsSelect, CreateTableUsing} import org.apache.spark.sql.types.StringType @@ -227,12 +227,6 @@ private[hive] trait HiveStrategies { tableName, userSpecifiedSchema, provider, opts, allowExisting, managedIfNoPath)) :: Nil case CreateTableUsingAsSelect(tableName, provider, false, mode, opts, query) => - val logicalPlan = hiveContext.parseSql(query) - val cmd = - CreateMetastoreDataSourceAsSelect(tableName, provider, mode, opts, logicalPlan) - ExecutedCommand(cmd) :: Nil - - case CreateTableUsingAsLogicalPlan(tableName, provider, false, mode, opts, query) => val cmd = CreateMetastoreDataSourceAsSelect(tableName, provider, mode, opts, query) ExecutedCommand(cmd) :: Nil From f3ff1eb2985ff3e1567645b898f6b42e4b01f237 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 16 Feb 2015 15:54:01 -0800 Subject: [PATCH 197/357] [SPARK-5839][SQL]HiveMetastoreCatalog does not recognize table names and aliases of data source tables. JIRA: https://issues.apache.org/jira/browse/SPARK-5839 Author: Yin Huai Closes #4626 from yhuai/SPARK-5839 and squashes the following commits: f779d85 [Yin Huai] Use subqeury to wrap replaced ParquetRelation. 2695f13 [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-5839 f1ba6ca [Yin Huai] Address comment. 2c7fa08 [Yin Huai] Use Subqueries to wrap a data source table. 
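A minimal sketch of the symptom and the fix, using the table name and queries from the new MetastoreDataSourcesSuite test included further down (it assumes that suite's HiveContext members, such as sparkContext, jsonRDD and sql, are in scope):

    // Qualifying a data source table by its name or an alias previously failed to resolve,
    // because the cached plan was returned without a Subquery carrying the table name.
    val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}"""))
    val df = jsonRDD(rdd)
    df.saveAsTable("savedJsonTable")
    sql("SELECT * FROM savedJsonTable WHERE savedJsonTable.a < 5")  // qualified by table name
    sql("SELECT * FROM savedJsonTable tmp WHERE tmp.a > 5")         // qualified by alias
    // The fix wraps the cached relation so both forms resolve:
    //   alias.map(a => Subquery(a, dataSourceTable)).getOrElse(Subquery(tableIdent.last, dataSourceTable))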
--- .../spark/sql/parquet/ParquetQuerySuite.scala | 5 +-- .../spark/sql/hive/HiveMetastoreCatalog.scala | 18 ++++++++-- .../sql/hive/MetastoreDataSourcesSuite.scala | 34 +++++++++++++++++++ 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index 9318c15520a10..8b4d05ec547c6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -30,9 +30,10 @@ import org.apache.spark.sql.test.TestSQLContext._ class ParquetQuerySuiteBase extends QueryTest with ParquetTest { val sqlContext = TestSQLContext - test("simple projection") { + test("simple select queries") { withParquetTable((0 until 10).map(i => (i, i.toString)), "t") { - checkAnswer(sql("SELECT _1 FROM t"), (0 until 10).map(Row.apply(_))) + checkAnswer(sql("SELECT _1 FROM t where t._1 > 5"), (6 until 10).map(Row.apply(_))) + checkAnswer(sql("SELECT _1 FROM t as tmp where tmp._1 < 5"), (0 until 5).map(Row.apply(_))) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 72211fe2e46c6..87bc9fe4fe9ad 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -160,7 +160,15 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with } if (table.getProperty("spark.sql.sources.provider") != null) { - cachedDataSourceTables(QualifiedTableName(databaseName, tblName).toLowerCase) + val dataSourceTable = + cachedDataSourceTables(QualifiedTableName(databaseName, tblName).toLowerCase) + // Then, if alias is specified, wrap the table with a Subquery using the alias. + // Othersie, wrap the table with a Subquery using the table name. + val withAlias = + alias.map(a => Subquery(a, dataSourceTable)).getOrElse( + Subquery(tableIdent.last, dataSourceTable)) + + withAlias } else if (table.isView) { // if the unresolved relation is from hive view // parse the text into logic node. 
@@ -433,7 +441,13 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val attributedRewrites = AttributeMap(relation.output.zip(parquetRelation.output)) lastPlan.transformUp { - case r: MetastoreRelation if r == relation => parquetRelation + case r: MetastoreRelation if r == relation => { + val withAlias = + r.alias.map(a => Subquery(a, parquetRelation)).getOrElse( + Subquery(r.tableName, parquetRelation)) + + withAlias + } case other => other.transformExpressions { case a: Attribute if a.resolved => attributedRewrites.getOrElse(a, a) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 375aae5d51915..0263e3bb56617 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -401,6 +401,40 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { sql("DROP TABLE jsonTable").collect().foreach(println) } + test("SPARK-5839 HiveMetastoreCatalog does not recognize table aliases of data source tables.") { + val originalDefaultSource = conf.defaultDataSourceName + + val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) + val df = jsonRDD(rdd) + + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.json") + // Save the df as a managed table (by not specifiying the path). + df.saveAsTable("savedJsonTable") + + checkAnswer( + sql("SELECT * FROM savedJsonTable where savedJsonTable.a < 5"), + (1 to 4).map(i => Row(i, s"str${i}"))) + + checkAnswer( + sql("SELECT * FROM savedJsonTable tmp where tmp.a > 5"), + (6 to 10).map(i => Row(i, s"str${i}"))) + + invalidateTable("savedJsonTable") + + checkAnswer( + sql("SELECT * FROM savedJsonTable where savedJsonTable.a < 5"), + (1 to 4).map(i => Row(i, s"str${i}"))) + + checkAnswer( + sql("SELECT * FROM savedJsonTable tmp where tmp.a > 5"), + (6 to 10).map(i => Row(i, s"str${i}"))) + + // Drop table will also delete the data. + sql("DROP TABLE savedJsonTable") + + conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource) + } + test("save table") { val originalDefaultSource = conf.defaultDataSourceName From cb6c48c874af2bd78ee73c1dc8a44fd28ecc0991 Mon Sep 17 00:00:00 2001 From: kai Date: Mon, 16 Feb 2015 15:58:05 -0800 Subject: [PATCH 198/357] [SQL] Optimize arithmetic and predicate operators Existing implementation of arithmetic operators and BinaryComparison operators have redundant type checking codes, e.g.: Expression.n2 is used by Add/Subtract/Multiply. (1) n2 always checks left.dataType == right.dataType. However, this checking should be done once when we resolve expression types; (2) n2 requires dataType is a NumericType. This can be done once. This PR optimizes arithmetic and predicate operators by removing such redundant type-checking codes. Some preliminary benchmarking on 10G TPC-H data over 5 r3.2xlarge EC2 machines shows that this PR can reduce the query time by 5.5% to 11%. The benchmark queries follow the template below, where OP is plus/minus/times/divide/remainder/bitwise and/bitwise or/bitwise xor. 
SELECT l_returnflag, l_linestatus, SUM(l_quantity OP cnt1), SUM(l_quantity OP cnt2), ...., SUM(l_quantity OP cnt700) FROM ( SELECT l_returnflag, l_linestatus, l_quantity, 1 AS cnt1, 2 AS cnt2, ..., 700 AS cnt700 FROM lineitem WHERE l_shipdate <= '1998-09-01' ) GROUP BY l_returnflag, l_linestatus; Author: kai Closes #4472 from kai-zeng/arithmetic-optimize and squashes the following commits: fef0cf1 [kai] Merge branch 'master' of github.com:apache/spark into arithmetic-optimize 4b3a1bb [kai] chmod a-x 5a41e49 [kai] chmod a-x Expression.scala cb37c94 [kai] rebase onto spark master 7f6e968 [kai] chmod 100755 -> 100644 6cddb46 [kai] format 7490dbc [kai] fix unresolved-expression exception for EqualTo 9c40bc0 [kai] fix bitwisenot 3cbd363 [kai] clean up test code ca47801 [kai] override evalInternal for bitwise ops 8fa84a1 [kai] add bitwise or and xor 6892fc4 [kai] revert override evalInternal f8eba24 [kai] override evalInternal 31ccdd4 [kai] rewrite all bitwise op and remove evalInternal 86297e2 [kai] generalized cb92ae1 [kai] bitwise-and: override eval 97a7d6c [kai] bitwise-and: override evalInternal using and func 0906c39 [kai] add bitwise test 62abbbc [kai] clean up predicate and arithmetic b34d58d [kai] add caching and benmark option 12c5b32 [kai] override eval 1cd7571 [kai] fix sqrt and maxof 03fd0c3 [kai] fix predicate 16fd84c [kai] optimize + - * / % -(unary) abs < > <= >= fd95823 [kai] remove unnecessary type checking 24d062f [kai] test suite --- .../apache/spark/sql/catalyst/SqlParser.scala | 0 .../spark/sql/catalyst/dsl/package.scala | 0 .../sql/catalyst/expressions/Expression.scala | 200 -------------- .../sql/catalyst/expressions/aggregates.scala | 0 .../sql/catalyst/expressions/arithmetic.scala | 243 ++++++++++++++---- .../sql/catalyst/expressions/predicates.scala | 107 +++++++- .../org/apache/spark/sql/types/Metadata.scala | 0 .../sql/catalyst/util/MetadataSuite.scala | 0 .../spark/sql/execution/Aggregate.scala | 0 .../hive/thriftserver/SparkSQLCLIDriver.scala | 0 10 files changed, 290 insertions(+), 260 deletions(-) mode change 100755 => 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala mode change 100755 => 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala mode change 100755 => 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala mode change 100755 => 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/Metadata.scala mode change 100755 => 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala mode change 100755 => 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala mode change 100755 => 100644 sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala old mode 100755 new mode 100644 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala old mode 100755 new mode 100644 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index c32a4b886eb82..6ad39b8372cfb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -77,206 +77,6 @@ abstract class Expression extends TreeNode[Expression] { case u: UnresolvedAttribute => PrettyAttribute(u.name) }.toString } - - /** - * A set of helper functions that return the correct descendant of `scala.math.Numeric[T]` type - * and do any casting necessary of child evaluation. - */ - @inline - def n1(e: Expression, i: Row, f: ((Numeric[Any], Any) => Any)): Any = { - val evalE = e.eval(i) - if (evalE == null) { - null - } else { - e.dataType match { - case n: NumericType => - val castedFunction = f.asInstanceOf[(Numeric[n.JvmType], n.JvmType) => n.JvmType] - castedFunction(n.numeric, evalE.asInstanceOf[n.JvmType]) - case other => sys.error(s"Type $other does not support numeric operations") - } - } - } - - /** - * Evaluation helper function for 2 Numeric children expressions. Those expressions are supposed - * to be in the same data type, and also the return type. - * Either one of the expressions result is null, the evaluation result should be null. - */ - @inline - protected final def n2( - i: Row, - e1: Expression, - e2: Expression, - f: ((Numeric[Any], Any, Any) => Any)): Any = { - - if (e1.dataType != e2.dataType) { - throw new TreeNodeException(this, s"Types do not match ${e1.dataType} != ${e2.dataType}") - } - - val evalE1 = e1.eval(i) - if(evalE1 == null) { - null - } else { - val evalE2 = e2.eval(i) - if (evalE2 == null) { - null - } else { - e1.dataType match { - case n: NumericType => - f.asInstanceOf[(Numeric[n.JvmType], n.JvmType, n.JvmType) => n.JvmType]( - n.numeric, evalE1.asInstanceOf[n.JvmType], evalE2.asInstanceOf[n.JvmType]) - case other => sys.error(s"Type $other does not support numeric operations") - } - } - } - } - - /** - * Evaluation helper function for 2 Fractional children expressions. Those expressions are - * supposed to be in the same data type, and also the return type. - * Either one of the expressions result is null, the evaluation result should be null. - */ - @inline - protected final def f2( - i: Row, - e1: Expression, - e2: Expression, - f: ((Fractional[Any], Any, Any) => Any)): Any = { - if (e1.dataType != e2.dataType) { - throw new TreeNodeException(this, s"Types do not match ${e1.dataType} != ${e2.dataType}") - } - - val evalE1 = e1.eval(i: Row) - if(evalE1 == null) { - null - } else { - val evalE2 = e2.eval(i: Row) - if (evalE2 == null) { - null - } else { - e1.dataType match { - case ft: FractionalType => - f.asInstanceOf[(Fractional[ft.JvmType], ft.JvmType, ft.JvmType) => ft.JvmType]( - ft.fractional, evalE1.asInstanceOf[ft.JvmType], evalE2.asInstanceOf[ft.JvmType]) - case other => sys.error(s"Type $other does not support fractional operations") - } - } - } - } - - /** - * Evaluation helper function for 1 Fractional children expression. - * if the expression result is null, the evaluation result should be null. - */ - @inline - protected final def f1(i: Row, e1: Expression, f: ((Fractional[Any], Any) => Any)): Any = { - val evalE1 = e1.eval(i: Row) - if(evalE1 == null) { - null - } else { - e1.dataType match { - case ft: FractionalType => - f.asInstanceOf[(Fractional[ft.JvmType], ft.JvmType) => ft.JvmType]( - ft.fractional, evalE1.asInstanceOf[ft.JvmType]) - case other => sys.error(s"Type $other does not support fractional operations") - } - } - } - - /** - * Evaluation helper function for 2 Integral children expressions. Those expressions are - * supposed to be in the same data type, and also the return type. 
- * Either one of the expressions result is null, the evaluation result should be null. - */ - @inline - protected final def i2( - i: Row, - e1: Expression, - e2: Expression, - f: ((Integral[Any], Any, Any) => Any)): Any = { - if (e1.dataType != e2.dataType) { - throw new TreeNodeException(this, s"Types do not match ${e1.dataType} != ${e2.dataType}") - } - - val evalE1 = e1.eval(i) - if(evalE1 == null) { - null - } else { - val evalE2 = e2.eval(i) - if (evalE2 == null) { - null - } else { - e1.dataType match { - case i: IntegralType => - f.asInstanceOf[(Integral[i.JvmType], i.JvmType, i.JvmType) => i.JvmType]( - i.integral, evalE1.asInstanceOf[i.JvmType], evalE2.asInstanceOf[i.JvmType]) - case i: FractionalType => - f.asInstanceOf[(Integral[i.JvmType], i.JvmType, i.JvmType) => i.JvmType]( - i.asIntegral, evalE1.asInstanceOf[i.JvmType], evalE2.asInstanceOf[i.JvmType]) - case other => sys.error(s"Type $other does not support numeric operations") - } - } - } - } - - /** - * Evaluation helper function for 1 Integral children expression. - * if the expression result is null, the evaluation result should be null. - */ - @inline - protected final def i1(i: Row, e1: Expression, f: ((Integral[Any], Any) => Any)): Any = { - val evalE1 = e1.eval(i) - if(evalE1 == null) { - null - } else { - e1.dataType match { - case i: IntegralType => - f.asInstanceOf[(Integral[i.JvmType], i.JvmType) => i.JvmType]( - i.integral, evalE1.asInstanceOf[i.JvmType]) - case i: FractionalType => - f.asInstanceOf[(Integral[i.JvmType], i.JvmType) => i.JvmType]( - i.asIntegral, evalE1.asInstanceOf[i.JvmType]) - case other => sys.error(s"Type $other does not support numeric operations") - } - } - } - - /** - * Evaluation helper function for 2 Comparable children expressions. Those expressions are - * supposed to be in the same data type, and the return type should be Integer: - * Negative value: 1st argument less than 2nd argument - * Zero: 1st argument equals 2nd argument - * Positive value: 1st argument greater than 2nd argument - * - * Either one of the expressions result is null, the evaluation result should be null. 
- */ - @inline - protected final def c2( - i: Row, - e1: Expression, - e2: Expression, - f: ((Ordering[Any], Any, Any) => Any)): Any = { - if (e1.dataType != e2.dataType) { - throw new TreeNodeException(this, s"Types do not match ${e1.dataType} != ${e2.dataType}") - } - - val evalE1 = e1.eval(i) - if(evalE1 == null) { - null - } else { - val evalE2 = e2.eval(i) - if (evalE2 == null) { - null - } else { - e1.dataType match { - case i: NativeType => - f.asInstanceOf[(Ordering[i.JvmType], i.JvmType, i.JvmType) => Boolean]( - i.ordering, evalE1.asInstanceOf[i.JvmType], evalE2.asInstanceOf[i.JvmType]) - case other => sys.error(s"Type $other does not support ordered operations") - } - } - } - } } abstract class BinaryExpression extends Expression with trees.BinaryNode[Expression] { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala old mode 100755 new mode 100644 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index 574907f566c0f..00b0d3c683fe2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.analysis.UnresolvedException +import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.types._ case class UnaryMinus(child: Expression) extends UnaryExpression { @@ -28,8 +29,18 @@ case class UnaryMinus(child: Expression) extends UnaryExpression { def nullable = child.nullable override def toString = s"-$child" + lazy val numeric = dataType match { + case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]] + case other => sys.error(s"Type $other does not support numeric operations") + } + override def eval(input: Row): Any = { - n1(child, input, _.negate(_)) + val evalE = child.eval(input) + if (evalE == null) { + null + } else { + numeric.negate(evalE) + } } } @@ -41,18 +52,19 @@ case class Sqrt(child: Expression) extends UnaryExpression { def nullable = true override def toString = s"SQRT($child)" + lazy val numeric = child.dataType match { + case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]] + case other => sys.error(s"Type $other does not support non-negative numeric operations") + } + override def eval(input: Row): Any = { val evalE = child.eval(input) if (evalE == null) { null } else { - child.dataType match { - case n: NumericType => - val value = n.numeric.toDouble(evalE.asInstanceOf[n.JvmType]) - if (value < 0) null - else math.sqrt(value) - case other => sys.error(s"Type $other does not support non-negative numeric operations") - } + val value = numeric.toDouble(evalE) + if (value < 0) null + else math.sqrt(value) } } } @@ -98,19 +110,70 @@ abstract class BinaryArithmetic extends BinaryExpression { case class Add(left: Expression, right: Expression) extends BinaryArithmetic { def symbol = "+" - override def eval(input: Row): Any = n2(input, left, right, _.plus(_, _)) + lazy val numeric = dataType match { + case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]] + case other => sys.error(s"Type $other does not support numeric operations") + } + + override def eval(input: Row): Any = { + val evalE1 = left.eval(input) + if(evalE1 == 
null) { + null + } else { + val evalE2 = right.eval(input) + if (evalE2 == null) { + null + } else { + numeric.plus(evalE1, evalE2) + } + } + } } case class Subtract(left: Expression, right: Expression) extends BinaryArithmetic { def symbol = "-" - override def eval(input: Row): Any = n2(input, left, right, _.minus(_, _)) + lazy val numeric = dataType match { + case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]] + case other => sys.error(s"Type $other does not support numeric operations") + } + + override def eval(input: Row): Any = { + val evalE1 = left.eval(input) + if(evalE1 == null) { + null + } else { + val evalE2 = right.eval(input) + if (evalE2 == null) { + null + } else { + numeric.minus(evalE1, evalE2) + } + } + } } case class Multiply(left: Expression, right: Expression) extends BinaryArithmetic { def symbol = "*" - override def eval(input: Row): Any = n2(input, left, right, _.times(_, _)) + lazy val numeric = dataType match { + case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]] + case other => sys.error(s"Type $other does not support numeric operations") + } + + override def eval(input: Row): Any = { + val evalE1 = left.eval(input) + if(evalE1 == null) { + null + } else { + val evalE2 = right.eval(input) + if (evalE2 == null) { + null + } else { + numeric.times(evalE1, evalE2) + } + } + } } case class Divide(left: Expression, right: Expression) extends BinaryArithmetic { @@ -118,16 +181,25 @@ case class Divide(left: Expression, right: Expression) extends BinaryArithmetic override def nullable = true + lazy val div: (Any, Any) => Any = dataType match { + case ft: FractionalType => ft.fractional.asInstanceOf[Fractional[Any]].div + case it: IntegralType => it.integral.asInstanceOf[Integral[Any]].quot + case other => sys.error(s"Type $other does not support numeric operations") + } + override def eval(input: Row): Any = { val evalE2 = right.eval(input) - dataType match { - case _ if evalE2 == null => null - case _ if evalE2 == 0 => null - case ft: FractionalType => f1(input, left, _.div(_, evalE2.asInstanceOf[ft.JvmType])) - case it: IntegralType => i1(input, left, _.quot(_, evalE2.asInstanceOf[it.JvmType])) + if (evalE2 == null || evalE2 == 0) { + null + } else { + val evalE1 = left.eval(input) + if (evalE1 == null) { + null + } else { + div(evalE1, evalE2) + } } } - } case class Remainder(left: Expression, right: Expression) extends BinaryArithmetic { @@ -135,12 +207,23 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet override def nullable = true + lazy val integral = dataType match { + case i: IntegralType => i.integral.asInstanceOf[Integral[Any]] + case i: FractionalType => i.asIntegral.asInstanceOf[Integral[Any]] + case other => sys.error(s"Type $other does not support numeric operations") + } + override def eval(input: Row): Any = { val evalE2 = right.eval(input) - dataType match { - case _ if evalE2 == null => null - case _ if evalE2 == 0 => null - case nt: NumericType => i1(input, left, _.rem(_, evalE2.asInstanceOf[nt.JvmType])) + if (evalE2 == null || evalE2 == 0) { + null + } else { + val evalE1 = left.eval(input) + if (evalE1 == null) { + null + } else { + integral.rem(evalE1, evalE2) + } } } } @@ -151,13 +234,19 @@ case class Remainder(left: Expression, right: Expression) extends BinaryArithmet case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithmetic { def symbol = "&" - override def evalInternal(evalE1: EvaluatedType, evalE2: EvaluatedType): Any = dataType match { - case ByteType => 
(evalE1.asInstanceOf[Byte] & evalE2.asInstanceOf[Byte]).toByte - case ShortType => (evalE1.asInstanceOf[Short] & evalE2.asInstanceOf[Short]).toShort - case IntegerType => evalE1.asInstanceOf[Int] & evalE2.asInstanceOf[Int] - case LongType => evalE1.asInstanceOf[Long] & evalE2.asInstanceOf[Long] + lazy val and: (Any, Any) => Any = dataType match { + case ByteType => + ((evalE1: Byte, evalE2: Byte) => (evalE1 & evalE2).toByte).asInstanceOf[(Any, Any) => Any] + case ShortType => + ((evalE1: Short, evalE2: Short) => (evalE1 & evalE2).toShort).asInstanceOf[(Any, Any) => Any] + case IntegerType => + ((evalE1: Int, evalE2: Int) => evalE1 & evalE2).asInstanceOf[(Any, Any) => Any] + case LongType => + ((evalE1: Long, evalE2: Long) => evalE1 & evalE2).asInstanceOf[(Any, Any) => Any] case other => sys.error(s"Unsupported bitwise & operation on $other") } + + override def evalInternal(evalE1: EvaluatedType, evalE2: EvaluatedType): Any = and(evalE1, evalE2) } /** @@ -166,13 +255,19 @@ case class BitwiseAnd(left: Expression, right: Expression) extends BinaryArithme case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmetic { def symbol = "|" - override def evalInternal(evalE1: EvaluatedType, evalE2: EvaluatedType): Any = dataType match { - case ByteType => (evalE1.asInstanceOf[Byte] | evalE2.asInstanceOf[Byte]).toByte - case ShortType => (evalE1.asInstanceOf[Short] | evalE2.asInstanceOf[Short]).toShort - case IntegerType => evalE1.asInstanceOf[Int] | evalE2.asInstanceOf[Int] - case LongType => evalE1.asInstanceOf[Long] | evalE2.asInstanceOf[Long] + lazy val or: (Any, Any) => Any = dataType match { + case ByteType => + ((evalE1: Byte, evalE2: Byte) => (evalE1 | evalE2).toByte).asInstanceOf[(Any, Any) => Any] + case ShortType => + ((evalE1: Short, evalE2: Short) => (evalE1 | evalE2).toShort).asInstanceOf[(Any, Any) => Any] + case IntegerType => + ((evalE1: Int, evalE2: Int) => evalE1 | evalE2).asInstanceOf[(Any, Any) => Any] + case LongType => + ((evalE1: Long, evalE2: Long) => evalE1 | evalE2).asInstanceOf[(Any, Any) => Any] case other => sys.error(s"Unsupported bitwise | operation on $other") } + + override def evalInternal(evalE1: EvaluatedType, evalE2: EvaluatedType): Any = or(evalE1, evalE2) } /** @@ -181,13 +276,19 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet case class BitwiseXor(left: Expression, right: Expression) extends BinaryArithmetic { def symbol = "^" - override def evalInternal(evalE1: EvaluatedType, evalE2: EvaluatedType): Any = dataType match { - case ByteType => (evalE1.asInstanceOf[Byte] ^ evalE2.asInstanceOf[Byte]).toByte - case ShortType => (evalE1.asInstanceOf[Short] ^ evalE2.asInstanceOf[Short]).toShort - case IntegerType => evalE1.asInstanceOf[Int] ^ evalE2.asInstanceOf[Int] - case LongType => evalE1.asInstanceOf[Long] ^ evalE2.asInstanceOf[Long] + lazy val xor: (Any, Any) => Any = dataType match { + case ByteType => + ((evalE1: Byte, evalE2: Byte) => (evalE1 ^ evalE2).toByte).asInstanceOf[(Any, Any) => Any] + case ShortType => + ((evalE1: Short, evalE2: Short) => (evalE1 ^ evalE2).toShort).asInstanceOf[(Any, Any) => Any] + case IntegerType => + ((evalE1: Int, evalE2: Int) => evalE1 ^ evalE2).asInstanceOf[(Any, Any) => Any] + case LongType => + ((evalE1: Long, evalE2: Long) => evalE1 ^ evalE2).asInstanceOf[(Any, Any) => Any] case other => sys.error(s"Unsupported bitwise ^ operation on $other") } + + override def evalInternal(evalE1: EvaluatedType, evalE2: EvaluatedType): Any = xor(evalE1, evalE2) } /** @@ -201,18 
+302,24 @@ case class BitwiseNot(child: Expression) extends UnaryExpression { def nullable = child.nullable override def toString = s"~$child" + lazy val not: (Any) => Any = dataType match { + case ByteType => + ((evalE: Byte) => (~evalE).toByte).asInstanceOf[(Any) => Any] + case ShortType => + ((evalE: Short) => (~evalE).toShort).asInstanceOf[(Any) => Any] + case IntegerType => + ((evalE: Int) => ~evalE).asInstanceOf[(Any) => Any] + case LongType => + ((evalE: Long) => ~evalE).asInstanceOf[(Any) => Any] + case other => sys.error(s"Unsupported bitwise ~ operation on $other") + } + override def eval(input: Row): Any = { val evalE = child.eval(input) if (evalE == null) { null } else { - dataType match { - case ByteType => (~evalE.asInstanceOf[Byte]).toByte - case ShortType => (~evalE.asInstanceOf[Short]).toShort - case IntegerType => ~evalE.asInstanceOf[Int] - case LongType => ~evalE.asInstanceOf[Long] - case other => sys.error(s"Unsupported bitwise ~ operation on $other") - } + not(evalE) } } } @@ -226,21 +333,35 @@ case class MaxOf(left: Expression, right: Expression) extends Expression { override def children = left :: right :: Nil - override def dataType = left.dataType + override lazy val resolved = + left.resolved && right.resolved && + left.dataType == right.dataType + + override def dataType = { + if (!resolved) { + throw new UnresolvedException(this, + s"datatype. Can not resolve due to differing types ${left.dataType}, ${right.dataType}") + } + left.dataType + } + + lazy val ordering = left.dataType match { + case i: NativeType => i.ordering.asInstanceOf[Ordering[Any]] + case other => sys.error(s"Type $other does not support ordered operations") + } override def eval(input: Row): Any = { - val leftEval = left.eval(input) - val rightEval = right.eval(input) - if (leftEval == null) { - rightEval - } else if (rightEval == null) { - leftEval + val evalE1 = left.eval(input) + val evalE2 = right.eval(input) + if (evalE1 == null) { + evalE2 + } else if (evalE2 == null) { + evalE1 } else { - val numeric = left.dataType.asInstanceOf[NumericType].numeric.asInstanceOf[Numeric[Any]] - if (numeric.compare(leftEval, rightEval) < 0) { - rightEval + if (ordering.compare(evalE1, evalE2) < 0) { + evalE2 } else { - leftEval + evalE1 } } } @@ -259,5 +380,17 @@ case class Abs(child: Expression) extends UnaryExpression { def nullable = child.nullable override def toString = s"Abs($child)" - override def eval(input: Row): Any = n1(child, input, _.abs(_)) + lazy val numeric = dataType match { + case n: NumericType => n.numeric.asInstanceOf[Numeric[Any]] + case other => sys.error(s"Type $other does not support numeric operations") + } + + override def eval(input: Row): Any = { + val evalE = child.eval(input) + if (evalE == null) { + null + } else { + numeric.abs(evalE) + } + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 365b1685a8e71..0024ef92c0452 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -18,8 +18,9 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.analysis.UnresolvedException +import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.types.{BinaryType, BooleanType} 
+import org.apache.spark.sql.types.{BinaryType, BooleanType, NativeType} object InterpretedPredicate { def apply(expression: Expression, inputSchema: Seq[Attribute]): (Row => Boolean) = @@ -201,22 +202,118 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp case class LessThan(left: Expression, right: Expression) extends BinaryComparison { def symbol = "<" - override def eval(input: Row): Any = c2(input, left, right, _.lt(_, _)) + + lazy val ordering = { + if (left.dataType != right.dataType) { + throw new TreeNodeException(this, + s"Types do not match ${left.dataType} != ${right.dataType}") + } + left.dataType match { + case i: NativeType => i.ordering.asInstanceOf[Ordering[Any]] + case other => sys.error(s"Type $other does not support ordered operations") + } + } + + override def eval(input: Row): Any = { + val evalE1 = left.eval(input) + if(evalE1 == null) { + null + } else { + val evalE2 = right.eval(input) + if (evalE2 == null) { + null + } else { + ordering.lt(evalE1, evalE2) + } + } + } } case class LessThanOrEqual(left: Expression, right: Expression) extends BinaryComparison { def symbol = "<=" - override def eval(input: Row): Any = c2(input, left, right, _.lteq(_, _)) + + lazy val ordering = { + if (left.dataType != right.dataType) { + throw new TreeNodeException(this, + s"Types do not match ${left.dataType} != ${right.dataType}") + } + left.dataType match { + case i: NativeType => i.ordering.asInstanceOf[Ordering[Any]] + case other => sys.error(s"Type $other does not support ordered operations") + } + } + + override def eval(input: Row): Any = { + val evalE1 = left.eval(input) + if(evalE1 == null) { + null + } else { + val evalE2 = right.eval(input) + if (evalE2 == null) { + null + } else { + ordering.lteq(evalE1, evalE2) + } + } + } } case class GreaterThan(left: Expression, right: Expression) extends BinaryComparison { def symbol = ">" - override def eval(input: Row): Any = c2(input, left, right, _.gt(_, _)) + + lazy val ordering = { + if (left.dataType != right.dataType) { + throw new TreeNodeException(this, + s"Types do not match ${left.dataType} != ${right.dataType}") + } + left.dataType match { + case i: NativeType => i.ordering.asInstanceOf[Ordering[Any]] + case other => sys.error(s"Type $other does not support ordered operations") + } + } + + override def eval(input: Row): Any = { + val evalE1 = left.eval(input) + if(evalE1 == null) { + null + } else { + val evalE2 = right.eval(input) + if (evalE2 == null) { + null + } else { + ordering.gt(evalE1, evalE2) + } + } + } } case class GreaterThanOrEqual(left: Expression, right: Expression) extends BinaryComparison { def symbol = ">=" - override def eval(input: Row): Any = c2(input, left, right, _.gteq(_, _)) + + lazy val ordering = { + if (left.dataType != right.dataType) { + throw new TreeNodeException(this, + s"Types do not match ${left.dataType} != ${right.dataType}") + } + left.dataType match { + case i: NativeType => i.ordering.asInstanceOf[Ordering[Any]] + case other => sys.error(s"Type $other does not support ordered operations") + } + } + + override def eval(input: Row): Any = { + val evalE1 = left.eval(input) + if(evalE1 == null) { + null + } else { + val evalE2 = right.eval(input) + if (evalE2 == null) { + null + } else { + ordering.gteq(evalE1, evalE2) + } + } + } } case class If(predicate: Expression, trueValue: Expression, falseValue: Expression) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Metadata.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Metadata.scala old mode 100755 new mode 100644 diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala old mode 100755 new mode 100644 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala old mode 100755 new mode 100644 diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala old mode 100755 new mode 100644 From e189cbb052d59eb499dd4312403925fdd72f5718 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 16 Feb 2015 15:59:23 -0800 Subject: [PATCH 199/357] [SPARK-4865][SQL]Include temporary tables in SHOW TABLES This PR adds a `ShowTablesCommand` to support `SHOW TABLES [IN databaseName]` SQL command. The result of `SHOW TABLE` has two columns, `tableName` and `isTemporary`. For temporary tables, the value of `isTemporary` column will be `false`. JIRA: https://issues.apache.org/jira/browse/SPARK-4865 Author: Yin Huai Closes #4618 from yhuai/showTablesCommand and squashes the following commits: 0c09791 [Yin Huai] Use ShowTablesCommand. 85ee76d [Yin Huai] Since SHOW TABLES is not a Hive native command any more and we will not see "OK" (originally generated by Hive's driver), use SHOW DATABASES in the test. 94bacac [Yin Huai] Add SHOW TABLES to the list of noExplainCommands. d71ed09 [Yin Huai] Fix test. a4a6ec3 [Yin Huai] Add SHOW TABLE command. --- .../org/apache/spark/sql/SQLContext.scala | 4 +- .../org/apache/spark/sql/SparkSQLParser.scala | 12 ++++- .../apache/spark/sql/execution/commands.scala | 34 +++++++++++++- .../apache/spark/sql/ListTablesSuite.scala | 37 +++++++++------ .../sql/hive/thriftserver/CliSuite.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 10 +++- .../org/apache/spark/sql/hive/HiveQl.scala | 2 +- .../spark/sql/hive/ListTablesSuite.scala | 46 ++++++++++--------- .../sql/hive/execution/HiveQuerySuite.scala | 14 +++--- 9 files changed, 111 insertions(+), 50 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index d08c2d1cfe030..aa0fc3e359a9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -794,7 +794,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * indicating if a table is a temporary one or not). */ def tables(): DataFrame = { - createDataFrame(catalog.getTables(None)).toDF("tableName", "isTemporary") + DataFrame(this, ShowTablesCommand(None)) } /** @@ -803,7 +803,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * indicating if a table is a temporary one or not). 
*/ def tables(databaseName: String): DataFrame = { - createDataFrame(catalog.getTables(Some(databaseName))).toDF("tableName", "isTemporary") + DataFrame(this, ShowTablesCommand(Some(databaseName))) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala index f1a4053b79113..00e19da4374a8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala @@ -23,7 +23,7 @@ import scala.util.parsing.combinator.RegexParsers import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.{UncacheTableCommand, CacheTableCommand, SetCommand} +import org.apache.spark.sql.execution.{ShowTablesCommand, UncacheTableCommand, CacheTableCommand, SetCommand} import org.apache.spark.sql.types.StringType @@ -57,12 +57,15 @@ private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends Abstr protected val AS = Keyword("AS") protected val CACHE = Keyword("CACHE") + protected val IN = Keyword("IN") protected val LAZY = Keyword("LAZY") protected val SET = Keyword("SET") + protected val SHOW = Keyword("SHOW") protected val TABLE = Keyword("TABLE") + protected val TABLES = Keyword("TABLES") protected val UNCACHE = Keyword("UNCACHE") - override protected lazy val start: Parser[LogicalPlan] = cache | uncache | set | others + override protected lazy val start: Parser[LogicalPlan] = cache | uncache | set | show | others private lazy val cache: Parser[LogicalPlan] = CACHE ~> LAZY.? ~ (TABLE ~> ident) ~ (AS ~> restInput).? ^^ { @@ -80,6 +83,11 @@ private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends Abstr case input => SetCommandParser(input) } + private lazy val show: Parser[LogicalPlan] = + SHOW ~> TABLES ~ (IN ~> ident).? ^^ { + case _ ~ dbName => ShowTablesCommand(dbName) + } + private lazy val others: Parser[LogicalPlan] = wholeInput ^^ { case input => fallback(input) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 2b1726ad4e89f..c6cd6eb6a22b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution import org.apache.spark.Logging import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.StringType +import org.apache.spark.sql.types.{BooleanType, StructField, StructType, StringType} import org.apache.spark.sql.{DataFrame, SQLConf, SQLContext} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Row, Attribute} @@ -191,3 +191,35 @@ case class DescribeCommand( } } } + +/** + * A command for users to get tables in the given database. + * If a databaseName is not given, the current database will be used. + * The syntax of using this command in SQL is: + * {{{ + * SHOW TABLES [IN databaseName] + * }}} + * :: DeveloperApi :: + */ +@DeveloperApi +case class ShowTablesCommand(databaseName: Option[String]) extends RunnableCommand { + + // The result of SHOW TABLES has two columns, tableName and isTemporary. 
+ override val output = { + val schema = StructType( + StructField("tableName", StringType, false) :: + StructField("isTemporary", BooleanType, false) :: Nil) + + schema.toAttributes + } + + override def run(sqlContext: SQLContext) = { + // Since we need to return a Seq of rows, we will call getTables directly + // instead of calling tables in sqlContext. + val rows = sqlContext.catalog.getTables(databaseName).map { + case (tableName, isTemporary) => Row(tableName, isTemporary) + } + + rows + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala index 282b98a987dd4..f9f41eb358bd5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala @@ -43,6 +43,10 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter { tables().filter("tableName = 'ListTablesSuiteTable'"), Row("ListTablesSuiteTable", true)) + checkAnswer( + sql("SHOW tables").filter("tableName = 'ListTablesSuiteTable'"), + Row("ListTablesSuiteTable", true)) + catalog.unregisterTable(Seq("ListTablesSuiteTable")) assert(tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0) } @@ -52,25 +56,32 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter { tables("DB").filter("tableName = 'ListTablesSuiteTable'"), Row("ListTablesSuiteTable", true)) + checkAnswer( + sql("show TABLES in DB").filter("tableName = 'ListTablesSuiteTable'"), + Row("ListTablesSuiteTable", true)) + catalog.unregisterTable(Seq("ListTablesSuiteTable")) assert(tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0) } test("query the returned DataFrame of tables") { - val tableDF = tables() - val schema = StructType( - StructField("tableName", StringType, true) :: + val expectedSchema = StructType( + StructField("tableName", StringType, false) :: StructField("isTemporary", BooleanType, false) :: Nil) - assert(schema === tableDF.schema) - tableDF.registerTempTable("tables") - checkAnswer( - sql("SELECT isTemporary, tableName from tables WHERE tableName = 'ListTablesSuiteTable'"), - Row(true, "ListTablesSuiteTable") - ) - checkAnswer( - tables().filter("tableName = 'tables'").select("tableName", "isTemporary"), - Row("tables", true)) - dropTempTable("tables") + Seq(tables(), sql("SHOW TABLes")).foreach { + case tableDF => + assert(expectedSchema === tableDF.schema) + + tableDF.registerTempTable("tables") + checkAnswer( + sql("SELECT isTemporary, tableName from tables WHERE tableName = 'ListTablesSuiteTable'"), + Row(true, "ListTablesSuiteTable") + ) + checkAnswer( + tables().filter("tableName = 'tables'").select("tableName", "isTemporary"), + Row("tables", true)) + dropTempTable("tables") + } } } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala index 60953576d0e37..8bca4b33b3ad1 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala @@ -121,6 +121,6 @@ class CliSuite extends FunSuite with BeforeAndAfterAll with Logging { } test("Single command with -e") { - runCliWithin(1.minute, Seq("-e", "SHOW TABLES;"))("" -> "OK") + runCliWithin(1.minute, Seq("-e", "SHOW DATABASES;"))("" -> "OK") } } diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 87bc9fe4fe9ad..0e43faa8afdaf 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -220,8 +220,14 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with } override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { - val dbName = databaseName.getOrElse(hive.sessionState.getCurrentDatabase) - client.getAllTables(dbName).map(tableName => (tableName, false)) + val dbName = if (!caseSensitive) { + if (databaseName.isDefined) Some(databaseName.get.toLowerCase) else None + } else { + databaseName + } + val db = dbName.getOrElse(hive.sessionState.getCurrentDatabase) + + client.getAllTables(db).map(tableName => (tableName, false)) } /** diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 5a1825a87dadb..98263f602e9ec 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -64,7 +64,6 @@ private[hive] object HiveQl { "TOK_SHOWINDEXES", "TOK_SHOWINDEXES", "TOK_SHOWPARTITIONS", - "TOK_SHOWTABLES", "TOK_SHOW_TBLPROPERTIES", "TOK_LOCKTABLE", @@ -129,6 +128,7 @@ private[hive] object HiveQl { // Commands that we do not need to explain. protected val noExplainCommands = Seq( "TOK_DESCTABLE", + "TOK_SHOWTABLES", "TOK_TRUNCATETABLE" // truncate table" is a NativeCommand, does not need to explain. ) ++ nativeCommands diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala index 321b784a3f842..e12a6c21ccac4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala @@ -49,29 +49,33 @@ class ListTablesSuite extends QueryTest with BeforeAndAfterAll { } test("get all tables of current database") { - val allTables = tables() - // We are using default DB. - checkAnswer( - allTables.filter("tableName = 'listtablessuitetable'"), - Row("listtablessuitetable", true)) - assert(allTables.filter("tableName = 'indblisttablessuitetable'").count() === 0) - checkAnswer( - allTables.filter("tableName = 'hivelisttablessuitetable'"), - Row("hivelisttablessuitetable", false)) - assert(allTables.filter("tableName = 'hiveindblisttablessuitetable'").count() === 0) + Seq(tables(), sql("SHOW TABLes")).foreach { + case allTables => + // We are using default DB. 
+ checkAnswer( + allTables.filter("tableName = 'listtablessuitetable'"), + Row("listtablessuitetable", true)) + assert(allTables.filter("tableName = 'indblisttablessuitetable'").count() === 0) + checkAnswer( + allTables.filter("tableName = 'hivelisttablessuitetable'"), + Row("hivelisttablessuitetable", false)) + assert(allTables.filter("tableName = 'hiveindblisttablessuitetable'").count() === 0) + } } test("getting all tables with a database name") { - val allTables = tables("ListTablesSuiteDB") - checkAnswer( - allTables.filter("tableName = 'listtablessuitetable'"), - Row("listtablessuitetable", true)) - checkAnswer( - allTables.filter("tableName = 'indblisttablessuitetable'"), - Row("indblisttablessuitetable", true)) - assert(allTables.filter("tableName = 'hivelisttablessuitetable'").count() === 0) - checkAnswer( - allTables.filter("tableName = 'hiveindblisttablessuitetable'"), - Row("hiveindblisttablessuitetable", false)) + Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb")).foreach { + case allTables => + checkAnswer( + allTables.filter("tableName = 'listtablessuitetable'"), + Row("listtablessuitetable", true)) + checkAnswer( + allTables.filter("tableName = 'indblisttablessuitetable'"), + Row("indblisttablessuitetable", true)) + assert(allTables.filter("tableName = 'hivelisttablessuitetable'").count() === 0) + checkAnswer( + allTables.filter("tableName = 'hiveindblisttablessuitetable'"), + Row("hiveindblisttablessuitetable", false)) + } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index d01dbf80ef66d..955f3f51cfe9f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -62,7 +62,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { test("SPARK-4908: concurrent hive native commands") { (1 to 100).par.map { _ => sql("USE default") - sql("SHOW TABLES") + sql("SHOW DATABASES") } } @@ -630,24 +630,24 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { } test("Query Hive native command execution result") { - val tableName = "test_native_commands" + val databaseName = "test_native_commands" assertResult(0) { - sql(s"DROP TABLE IF EXISTS $tableName").count() + sql(s"DROP DATABASE IF EXISTS $databaseName").count() } assertResult(0) { - sql(s"CREATE TABLE $tableName(key INT, value STRING)").count() + sql(s"CREATE DATABASE $databaseName").count() } assert( - sql("SHOW TABLES") + sql("SHOW DATABASES") .select('result) .collect() .map(_.getString(0)) - .contains(tableName)) + .contains(databaseName)) - assert(isExplanation(sql(s"EXPLAIN SELECT key, COUNT(*) FROM $tableName GROUP BY key"))) + assert(isExplanation(sql(s"EXPLAIN SELECT key, COUNT(*) FROM src GROUP BY key"))) TestHive.reset() } From 1294a6e01af0d4f6678ea8cb5d47dc97112608b5 Mon Sep 17 00:00:00 2001 From: Matt Whelan Date: Tue, 17 Feb 2015 00:59:49 +0000 Subject: [PATCH 200/357] SPARK-5848: tear down the ConsoleProgressBar timer The timer is a GC root, and failing to terminate it leaks SparkContext instances. 
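A minimal sketch of the resulting shutdown path, condensed from the diff that follows:

    // SparkContext.stop() now also cancels the console progress bar's java.util.Timer,
    // whose thread otherwise keeps the SparkContext reachable as a GC root.
    progressBar.foreach(_.stop())      // added to SparkContext.stop()
    def stop(): Unit = timer.cancel()  // added to ConsoleProgressBar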
Author: Matt Whelan Closes #4635 from MattWhelan/SPARK-5848 and squashes the following commits: 2a1e8a5 [Matt Whelan] SPARK-5848: teardown the ConsoleProgressBar timer --- core/src/main/scala/org/apache/spark/SparkContext.scala | 1 + .../scala/org/apache/spark/ui/ConsoleProgressBar.scala | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 24a316e40e673..fd8fac6df0d0e 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1363,6 +1363,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli cleaner.foreach(_.stop()) dagScheduler.stop() dagScheduler = null + progressBar.foreach(_.stop()) taskScheduler = null // TODO: Cache.stop()? env.stop() diff --git a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala index 27ba9e18237b5..67f572e79314d 100644 --- a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala +++ b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala @@ -28,7 +28,6 @@ import org.apache.spark._ * of them will be combined together, showed in one line. */ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { - // Carrige return val CR = '\r' // Update period of progress bar, in milliseconds @@ -121,4 +120,10 @@ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { clear() lastFinishTime = System.currentTimeMillis() } + + /** + * Tear down the timer thread. The timer thread is a GC root, and it retains the entire + * SparkContext if it's not terminated. + */ + def stop(): Unit = timer.cancel() } From b1bd1dd3228ef50fa7310d466afd834b8cb1f22e Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 16 Feb 2015 17:57:14 -0800 Subject: [PATCH 201/357] [SPARK-5788] [PySpark] capture the exception in python write thread The exception in Python writer thread will shutdown executor. Author: Davies Liu Closes #4577 from davies/exception and squashes the following commits: eb0ceff [Davies Liu] Update PythonRDD.scala 139b0db [Davies Liu] capture the exception in python write thread --- .../main/scala/org/apache/spark/api/python/PythonRDD.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index b89effc16d36d..252721192904f 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -248,13 +248,13 @@ private[spark] class PythonRDD( } catch { case e: Exception if context.isCompleted || context.isInterrupted => logDebug("Exception thrown after task completion (likely due to cleanup)", e) - worker.shutdownOutput() + Utils.tryLog(worker.shutdownOutput()) case e: Exception => // We must avoid throwing exceptions here, because the thread uncaught exception handler // will kill the whole executor (see org.apache.spark.executor.Executor). 
_exception = e - worker.shutdownOutput() + Utils.tryLog(worker.shutdownOutput()) } finally { // Release memory used by this thread for shuffles env.shuffleMemoryManager.releaseMemoryForThisThread() From 16687651f05bde8ff2e2fcef100383168958bf7f Mon Sep 17 00:00:00 2001 From: azagrebin Date: Mon, 16 Feb 2015 18:06:19 -0800 Subject: [PATCH 202/357] [SPARK-3340] Deprecate ADD_JARS and ADD_FILES I created a patch that disables the environment variables. Thereby scala or python shell log a warning message to notify user about the deprecation with the following message: scala: "ADD_JARS environment variable is deprecated, use --jar spark submit argument instead" python: "Warning: ADD_FILES environment variable is deprecated, use --py-files argument instead" Is it what is expected or the code associated with the variables should be just completely removed? Should it be somewhere documented? Author: azagrebin Closes #4616 from azagrebin/master and squashes the following commits: bab1aa9 [azagrebin] [SPARK-3340] Deprecate ADD_JARS and ADD_FILES: minor readability issue 0643895 [azagrebin] [SPARK-3340] Deprecate ADD_JARS and ADD_FILES: add warning messages 42f0107 [azagrebin] [SPARK-3340] Deprecate ADD_JARS and ADD_FILES --- python/pyspark/shell.py | 8 +++++--- .../src/main/scala/org/apache/spark/repl/SparkILoop.scala | 7 ++++--- .../src/main/scala/org/apache/spark/repl/Main.scala | 3 +++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index 89cf76920e353..4cf4b89ccfaa7 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -35,9 +35,10 @@ from pyspark.context import SparkContext from pyspark.storagelevel import StorageLevel -# this is the equivalent of ADD_JARS -add_files = (os.environ.get("ADD_FILES").split(',') - if os.environ.get("ADD_FILES") is not None else None) +# this is the deprecated equivalent of ADD_JARS +add_files = None +if os.environ.get("ADD_FILES") is not None: + add_files = os.environ.get("ADD_FILES").split(',') if os.environ.get("SPARK_EXECUTOR_URI"): SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) @@ -59,6 +60,7 @@ print("SparkContext available as sc.") if add_files is not None: + print("Warning: ADD_FILES environment variable is deprecated, use --py-files argument instead") print("Adding files: [%s]" % ", ".join(add_files)) # The ./bin/pyspark script stores the old PYTHONSTARTUP value in OLD_PYTHONSTARTUP, diff --git a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala index b4db3df795177..8dc0e0c965923 100644 --- a/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ b/repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkILoop.scala @@ -1064,15 +1064,16 @@ class SparkILoop( private def main(settings: Settings): Unit = process(settings) } -object SparkILoop { +object SparkILoop extends Logging { implicit def loopToInterpreter(repl: SparkILoop): SparkIMain = repl.intp private def echo(msg: String) = Console println msg def getAddedJars: Array[String] = { val envJars = sys.env.get("ADD_JARS") - val propJars = sys.props.get("spark.jars").flatMap { p => - if (p == "") None else Some(p) + if (envJars.isDefined) { + logWarning("ADD_JARS environment variable is deprecated, use --jar spark submit argument instead") } + val propJars = sys.props.get("spark.jars").flatMap { p => if (p == "") None else Some(p) } val jars = 
propJars.orElse(envJars).getOrElse("") Utils.resolveURIs(jars).split(",").filter(_.nonEmpty) } diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala index dc25692749aad..2210fbaafeadb 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala @@ -51,6 +51,9 @@ object Main extends Logging { def getAddedJars: Array[String] = { val envJars = sys.env.get("ADD_JARS") + if (envJars.isDefined) { + logWarning("ADD_JARS environment variable is deprecated, use --jar spark submit argument instead") + } val propJars = sys.props.get("spark.jars").flatMap { p => if (p == "") None else Some(p) } val jars = propJars.orElse(envJars).getOrElse("") Utils.resolveURIs(jars).split(",").filter(_.nonEmpty) From 58a82a7882d7a8a7e4064278c4bf28607d9a42ba Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 16 Feb 2015 18:08:02 -0800 Subject: [PATCH 203/357] [SPARK-5849] Handle more types of invalid JSON requests in SubmitRestProtocolMessage.parseAction This patch improves SubmitRestProtocol's handling of invalid JSON requests in cases where those requests were parsable as JSON but not as JSON objects (e.g. they could be parsed as ararys or strings). I replaced an unchecked cast with pattern-matching and added a new test case. Author: Josh Rosen Closes #4637 from JoshRosen/rest-protocol-cast and squashes the following commits: b3f282b [Josh Rosen] [SPARK-5849] Handle more types of invalid JSON in SubmitRestProtocolMessage.parseAction --- .../deploy/rest/SubmitRestProtocolMessage.scala | 16 ++++++++-------- .../deploy/rest/StandaloneRestSubmitSuite.scala | 4 ++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala index b877898231e3e..8f36635674a28 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolMessage.scala @@ -17,8 +17,6 @@ package org.apache.spark.deploy.rest -import scala.util.Try - import com.fasterxml.jackson.annotation._ import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility import com.fasterxml.jackson.annotation.JsonInclude.Include @@ -111,12 +109,14 @@ private[spark] object SubmitRestProtocolMessage { * If the action field is not found, throw a [[SubmitRestMissingFieldException]]. 
*/ def parseAction(json: String): String = { - parse(json).asInstanceOf[JObject].obj - .find { case (f, _) => f == "action" } - .map { case (_, v) => v.asInstanceOf[JString].s } - .getOrElse { - throw new SubmitRestMissingFieldException(s"Action field not found in JSON:\n$json") - } + val value: Option[String] = parse(json) match { + case JObject(fields) => + fields.collectFirst { case ("action", v) => v }.collect { case JString(s) => s } + case _ => None + } + value.getOrElse { + throw new SubmitRestMissingFieldException(s"Action field not found in JSON:\n$json") + } } /** diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala index a345e06ecb7d2..2fa90e3bd1c63 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala @@ -245,6 +245,7 @@ class StandaloneRestSubmitSuite extends FunSuite with BeforeAndAfterEach { val goodJson = constructSubmitRequest(masterUrl).toJson val badJson1 = goodJson.replaceAll("action", "fraction") // invalid JSON val badJson2 = goodJson.substring(goodJson.size / 2) // malformed JSON + val notJson = "\"hello, world\"" val (response1, code1) = sendHttpRequestWithResponse(submitRequestPath, "POST") // missing JSON val (response2, code2) = sendHttpRequestWithResponse(submitRequestPath, "POST", badJson1) val (response3, code3) = sendHttpRequestWithResponse(submitRequestPath, "POST", badJson2) @@ -252,6 +253,7 @@ class StandaloneRestSubmitSuite extends FunSuite with BeforeAndAfterEach { val (response5, code5) = sendHttpRequestWithResponse(s"$killRequestPath/", "POST") val (response6, code6) = sendHttpRequestWithResponse(statusRequestPath, "GET") // missing ID val (response7, code7) = sendHttpRequestWithResponse(s"$statusRequestPath/", "GET") + val (response8, code8) = sendHttpRequestWithResponse(submitRequestPath, "POST", notJson) // these should all fail as error responses getErrorResponse(response1) getErrorResponse(response2) @@ -260,6 +262,7 @@ class StandaloneRestSubmitSuite extends FunSuite with BeforeAndAfterEach { getErrorResponse(response5) getErrorResponse(response6) getErrorResponse(response7) + getErrorResponse(response8) assert(code1 === HttpServletResponse.SC_BAD_REQUEST) assert(code2 === HttpServletResponse.SC_BAD_REQUEST) assert(code3 === HttpServletResponse.SC_BAD_REQUEST) @@ -267,6 +270,7 @@ class StandaloneRestSubmitSuite extends FunSuite with BeforeAndAfterEach { assert(code5 === HttpServletResponse.SC_BAD_REQUEST) assert(code6 === HttpServletResponse.SC_BAD_REQUEST) assert(code7 === HttpServletResponse.SC_BAD_REQUEST) + assert(code8 === HttpServletResponse.SC_BAD_REQUEST) } test("bad request paths") { From 0e180bfc3c7f18780d4fc4f42681609832418e43 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Mon, 16 Feb 2015 19:00:30 -0800 Subject: [PATCH 204/357] [SQL] Various DataFrame doc changes. Added a bunch of tags. Also changed parquetFile to take varargs rather than a string followed by varargs. Author: Reynold Xin Closes #4636 from rxin/df-doc and squashes the following commits: 651f80c [Reynold Xin] Fixed parquetFile in PySpark. 8dc3024 [Reynold Xin] [SQL] Various DataFrame doc changes. 
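As an illustrative aside (not part of the original commit message), the parquetFile change described above is mostly a call-site concern. A rough Scala sketch, assuming an existing SQLContext named sqlContext and purely hypothetical file paths:

    // Old shape: one mandatory path followed by varargs:
    //   sqlContext.parquetFile("events-part1.parquet", "events-part2.parquet")
    // New shape: a single varargs parameter, so a collection of paths can be
    // expanded directly; per the new scaladoc, an empty path list simply
    // yields an empty DataFrame.
    val paths = Seq("events-part1.parquet", "events-part2.parquet") // hypothetical
    val df = sqlContext.parquetFile(paths: _*)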
--- python/pyspark/sql/context.py | 7 +- .../scala/org/apache/spark/sql/Column.scala | 165 ++++++++++++- .../org/apache/spark/sql/DataFrame.scala | 14 +- .../org/apache/spark/sql/DataFrameImpl.scala | 4 +- .../org/apache/spark/sql/GroupedData.scala | 3 + .../org/apache/spark/sql/SQLContext.scala | 97 +++++--- .../apache/spark/sql/UDFRegistration.scala | 4 +- .../org/apache/spark/sql/functions.scala | 226 +++++++++++++++--- .../spark/sql/hive/HiveStrategies.scala | 3 +- 9 files changed, 436 insertions(+), 87 deletions(-) diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 7683c1b4dfa4e..dd2cd5ee76f60 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -431,11 +431,10 @@ def parquetFile(self, *paths): True """ gateway = self._sc._gateway - jpath = paths[0] - jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths) - 1) - for i in range(1, len(paths)): + jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths)) + for i in range(0, len(paths)): jpaths[i] = paths[i] - jdf = self._ssql_ctx.parquetFile(jpath, jpaths) + jdf = self._ssql_ctx.parquetFile(jpaths) return DataFrame(jdf, self) def jsonFile(self, path, schema=None, samplingRatio=1.0): diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index f6ecee1af8aad..8b6241c213c87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import scala.language.implicitConversions +import org.apache.spark.annotation.Experimental import org.apache.spark.sql.functions.lit import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} @@ -41,21 +42,15 @@ private[sql] object Column { /** + * :: Experimental :: * A column in a [[DataFrame]]. * - * `Column` instances can be created by: - * {{{ - * // 1. Select a column out of a DataFrame - * df("colName") - * - * // 2. Create a literal expression - * Literal(1) - * - * // 3. Create new columns from - * }}} - * + * @groupname java_expr_ops Java-specific expression operators. + * @groupname expr_ops Expression operators. + * @groupname df_ops DataFrame functions. + * @groupname Ungrouped Support functions for DataFrames. */ -// TODO: Improve documentation. +@Experimental trait Column extends DataFrame { protected[sql] def expr: Expression @@ -129,6 +124,8 @@ trait Column extends DataFrame { * import static org.apache.spark.sql.functions.*; * df.select( negate(col("amount") ); * }}} + * + * @group expr_ops */ def unary_- : Column = exprToColumn(UnaryMinus(expr)) @@ -142,6 +139,8 @@ trait Column extends DataFrame { * import static org.apache.spark.sql.functions.*; * df.filter( not(df.col("isActive")) ); * }} + * + * @group expr_ops */ def unary_! 
: Column = exprToColumn(Not(expr)) @@ -155,6 +154,8 @@ trait Column extends DataFrame { * import static org.apache.spark.sql.functions.*; * df.filter( col("colA").equalTo(col("colB")) ); * }}} + * + * @group expr_ops */ def === (other: Any): Column = constructColumn(other) { o => EqualTo(expr, o.expr) @@ -170,6 +171,8 @@ trait Column extends DataFrame { * import static org.apache.spark.sql.functions.*; * df.filter( col("colA").equalTo(col("colB")) ); * }}} + * + * @group expr_ops */ def equalTo(other: Any): Column = this === other @@ -184,6 +187,8 @@ trait Column extends DataFrame { * import static org.apache.spark.sql.functions.*; * df.filter( col("colA").notEqual(col("colB")) ); * }}} + * + * @group expr_ops */ def !== (other: Any): Column = constructColumn(other) { o => Not(EqualTo(expr, o.expr)) @@ -200,6 +205,8 @@ trait Column extends DataFrame { * import static org.apache.spark.sql.functions.*; * df.filter( col("colA").notEqual(col("colB")) ); * }}} + * + * @group java_expr_ops */ def notEqual(other: Any): Column = constructColumn(other) { o => Not(EqualTo(expr, o.expr)) @@ -215,6 +222,8 @@ trait Column extends DataFrame { * import static org.apache.spark.sql.functions.*; * people.select( people("age").gt(21) ); * }}} + * + * @group expr_ops */ def > (other: Any): Column = constructColumn(other) { o => GreaterThan(expr, o.expr) @@ -230,6 +239,8 @@ trait Column extends DataFrame { * import static org.apache.spark.sql.functions.*; * people.select( people("age").gt(21) ); * }}} + * + * @group java_expr_ops */ def gt(other: Any): Column = this > other @@ -242,6 +253,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("age").lt(21) ); * }}} + * + * @group expr_ops */ def < (other: Any): Column = constructColumn(other) { o => LessThan(expr, o.expr) @@ -256,6 +269,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("age").lt(21) ); * }}} + * + * @group java_expr_ops */ def lt(other: Any): Column = this < other @@ -268,6 +283,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("age").leq(21) ); * }}} + * + * @group expr_ops */ def <= (other: Any): Column = constructColumn(other) { o => LessThanOrEqual(expr, o.expr) @@ -282,6 +299,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("age").leq(21) ); * }}} + * + * @group java_expr_ops */ def leq(other: Any): Column = this <= other @@ -294,6 +313,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("age").geq(21) ) * }}} + * + * @group expr_ops */ def >= (other: Any): Column = constructColumn(other) { o => GreaterThanOrEqual(expr, o.expr) @@ -308,11 +329,15 @@ trait Column extends DataFrame { * // Java: * people.select( people("age").geq(21) ) * }}} + * + * @group java_expr_ops */ def geq(other: Any): Column = this >= other /** * Equality test that is safe for null values. + * + * @group expr_ops */ def <=> (other: Any): Column = constructColumn(other) { o => EqualNullSafe(expr, o.expr) @@ -320,16 +345,22 @@ trait Column extends DataFrame { /** * Equality test that is safe for null values. + * + * @group java_expr_ops */ def eqNullSafe(other: Any): Column = this <=> other /** * True if the current expression is null. + * + * @group expr_ops */ def isNull: Column = exprToColumn(IsNull(expr)) /** * True if the current expression is NOT null. 
+ * + * @group expr_ops */ def isNotNull: Column = exprToColumn(IsNotNull(expr)) @@ -342,6 +373,8 @@ trait Column extends DataFrame { * // Java: * people.filter( people("inSchool").or(people("isEmployed")) ); * }}} + * + * @group expr_ops */ def || (other: Any): Column = constructColumn(other) { o => Or(expr, o.expr) @@ -356,6 +389,8 @@ trait Column extends DataFrame { * // Java: * people.filter( people("inSchool").or(people("isEmployed")) ); * }}} + * + * @group java_expr_ops */ def or(other: Column): Column = this || other @@ -368,6 +403,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("inSchool").and(people("isEmployed")) ); * }}} + * + * @group expr_ops */ def && (other: Any): Column = constructColumn(other) { o => And(expr, o.expr) @@ -382,6 +419,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("inSchool").and(people("isEmployed")) ); * }}} + * + * @group java_expr_ops */ def and(other: Column): Column = this && other @@ -394,6 +433,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("height").plus(people("weight")) ); * }}} + * + * @group expr_ops */ def + (other: Any): Column = constructColumn(other) { o => Add(expr, o.expr) @@ -408,6 +449,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("height").plus(people("weight")) ); * }}} + * + * @group java_expr_ops */ def plus(other: Any): Column = this + other @@ -420,6 +463,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("height").minus(people("weight")) ); * }}} + * + * @group expr_ops */ def - (other: Any): Column = constructColumn(other) { o => Subtract(expr, o.expr) @@ -434,6 +479,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("height").minus(people("weight")) ); * }}} + * + * @group java_expr_ops */ def minus(other: Any): Column = this - other @@ -446,6 +493,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("height").multiply(people("weight")) ); * }}} + * + * @group expr_ops */ def * (other: Any): Column = constructColumn(other) { o => Multiply(expr, o.expr) @@ -460,6 +509,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("height").multiply(people("weight")) ); * }}} + * + * @group java_expr_ops */ def multiply(other: Any): Column = this * other @@ -472,6 +523,8 @@ trait Column extends DataFrame { * // Java: * people.select( people("height").divide(people("weight")) ); * }}} + * + * @group expr_ops */ def / (other: Any): Column = constructColumn(other) { o => Divide(expr, o.expr) @@ -486,11 +539,15 @@ trait Column extends DataFrame { * // Java: * people.select( people("height").divide(people("weight")) ); * }}} + * + * @group java_expr_ops */ def divide(other: Any): Column = this / other /** * Modulo (a.k.a. remainder) expression. + * + * @group expr_ops */ def % (other: Any): Column = constructColumn(other) { o => Remainder(expr, o.expr) @@ -498,29 +555,47 @@ trait Column extends DataFrame { /** * Modulo (a.k.a. remainder) expression. + * + * @group java_expr_ops */ def mod(other: Any): Column = this % other /** * A boolean expression that is evaluated to true if the value of this expression is contained * by the evaluated values of the arguments. + * + * @group expr_ops */ @scala.annotation.varargs def in(list: Column*): Column = { new IncomputableColumn(In(expr, list.map(_.expr))) } + /** + * SQL like expression. 
+ * + * @group expr_ops + */ def like(literal: String): Column = exprToColumn(Like(expr, lit(literal).expr)) + /** + * SQL RLIKE expression (LIKE with Regex). + * + * @group expr_ops + */ def rlike(literal: String): Column = exprToColumn(RLike(expr, lit(literal).expr)) /** * An expression that gets an item at position `ordinal` out of an array. + * + * @group expr_ops */ def getItem(ordinal: Int): Column = exprToColumn(GetItem(expr, Literal(ordinal))) /** * An expression that gets a field by name in a [[StructField]]. + * + * @group expr_ops */ def getField(fieldName: String): Column = exprToColumn(UnresolvedGetField(expr, fieldName)) @@ -528,6 +603,8 @@ trait Column extends DataFrame { * An expression that returns a substring. * @param startPos expression for the starting position. * @param len expression for the length of the substring. + * + * @group expr_ops */ def substr(startPos: Column, len: Column): Column = exprToColumn(Substring(expr, startPos.expr, len.expr), computable = false) @@ -536,24 +613,51 @@ trait Column extends DataFrame { * An expression that returns a substring. * @param startPos starting position. * @param len length of the substring. + * + * @group expr_ops */ def substr(startPos: Int, len: Int): Column = exprToColumn(Substring(expr, lit(startPos).expr, lit(len).expr)) + /** + * Contains the other element. + * + * @group expr_ops + */ def contains(other: Any): Column = constructColumn(other) { o => Contains(expr, o.expr) } + /** + * String starts with. + * + * @group expr_ops + */ def startsWith(other: Column): Column = constructColumn(other) { o => StartsWith(expr, o.expr) } + /** + * String starts with another string literal. + * + * @group expr_ops + */ def startsWith(literal: String): Column = this.startsWith(lit(literal)) + /** + * String ends with. + * + * @group expr_ops + */ def endsWith(other: Column): Column = constructColumn(other) { o => EndsWith(expr, o.expr) } + /** + * String ends with another string literal. + * + * @group expr_ops + */ def endsWith(literal: String): Column = this.endsWith(lit(literal)) /** @@ -562,6 +666,8 @@ trait Column extends DataFrame { * // Renames colA to colB in select output. * df.select($"colA".as("colB")) * }}} + * + * @group expr_ops */ override def as(alias: String): Column = exprToColumn(Alias(expr, alias)()) @@ -571,6 +677,8 @@ trait Column extends DataFrame { * // Renames colA to colB in select output. * df.select($"colA".as('colB)) * }}} + * + * @group expr_ops */ override def as(alias: Symbol): Column = exprToColumn(Alias(expr, alias.name)()) @@ -584,6 +692,8 @@ trait Column extends DataFrame { * // equivalent to * df.select(df("colA").cast("int")) * }}} + * + * @group expr_ops */ def cast(to: DataType): Column = exprToColumn(Cast(expr, to)) @@ -595,6 +705,8 @@ trait Column extends DataFrame { * // Casts colA to integer. * df.select(df("colA").cast("int")) * }}} + * + * @group expr_ops */ def cast(to: String): Column = exprToColumn( Cast(expr, to.toLowerCase match { @@ -613,10 +725,39 @@ trait Column extends DataFrame { }) ) + /** + * Returns an ordering used in sorting. + * {{{ + * // Scala: sort a DataFrame by age column in descending order. + * df.sort(df("age").desc) + * + * // Java + * df.sort(df.col("age").desc()); + * }}} + * + * @group expr_ops + */ def desc: Column = exprToColumn(SortOrder(expr, Descending), computable = false) + /** + * Returns an ordering used in sorting. + * {{{ + * // Scala: sort a DataFrame by age column in ascending order. 
+ * df.sort(df("age").asc) + * + * // Java + * df.sort(df.col("age").asc()); + * }}} + * + * @group expr_ops + */ def asc: Column = exprToColumn(SortOrder(expr, Ascending), computable = false) + /** + * Prints the plans (logical and physical) to the console for debugging purpose. + * + * @group df_ops + */ override def explain(extended: Boolean): Unit = { if (extended) { println(expr) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index e21e989f36c65..c0c3cb40cf1da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -41,19 +41,23 @@ private[sql] object DataFrame { * :: Experimental :: * A distributed collection of data organized into named columns. * - * A [[DataFrame]] is equivalent to a relational table in Spark SQL, and can be created using - * various functions in [[SQLContext]]. + * A [[DataFrame]] is equivalent to a relational table in Spark SQL. There are multiple ways + * to create a [[DataFrame]]: * {{{ + * // Create a DataFrame from Parquet files * val people = sqlContext.parquetFile("...") + * + * // Create a DataFrame from data sources + * val df = * }}} * * Once created, it can be manipulated using the various domain-specific-language (DSL) functions - * defined in: [[DataFrame]] (this class), [[Column]], [[functions]] for the DSL. + * defined in: [[DataFrame]] (this class), [[Column]], and [[functions]]. * - * To select a column from the data frame, use the apply method: + * To select a column from the data frame, use `apply` method in Scala and `col` in Java. * {{{ * val ageCol = people("age") // in Scala - * Column ageCol = people.apply("age") // in Java + * Column ageCol = people.col("age") // in Java * }}} * * Note that the [[Column]] type can also be manipulated through its various functions. 
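As a hedged usage sketch (illustrative only, not taken from this patch), the Column operators documented above combine roughly as follows; the DataFrame `people` and its "name"/"age" columns are hypothetical:

    // apply, >=, &&, isNotNull, arithmetic, as(alias) and desc, as documented above
    val adults = people
      .filter(people("age") >= 21 && people("name").isNotNull)
      .select(people("name"), (people("age") + 1).as("ageNextYear"))
      .sort(people("age").desc)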
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 3c1cf8d5e3851..848ea2e05624f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -238,9 +238,10 @@ private[sql] class DataFrameImpl protected[sql]( } override def withColumnRenamed(existingName: String, newName: String): DataFrame = { + val resolver = sqlContext.analyzer.resolver val colNames = schema.map { field => val name = field.name - if (name == existingName) Column(name).as(newName) else Column(name) + if (resolver(name, existingName)) Column(name).as(newName) else Column(name) } select(colNames :_*) } @@ -368,6 +369,7 @@ private[sql] class DataFrameImpl protected[sql]( ///////////////////////////////////////////////////////////////////////////// override def rdd: RDD[Row] = { + // use a local variable to make sure the map closure doesn't capture the whole DataFrame val schema = this.schema queryExecution.executedPlan.execute().map(ScalaReflection.convertRowToScala(_, schema)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala index 2ecf086de92f7..17158303b889a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql import scala.collection.JavaConversions._ import scala.language.implicitConversions +import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.analysis.Star import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Aggregate @@ -27,8 +28,10 @@ import org.apache.spark.sql.types.NumericType /** + * :: Experimental :: * A set of methods for aggregations on a [[DataFrame]], created by [[DataFrame.groupBy]]. */ +@Experimental class GroupedData protected[sql](df: DataFrameImpl, groupingExprs: Seq[Expression]) { private[this] implicit def toDF(aggExprs: Seq[NamedExpression]): DataFrame = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index aa0fc3e359a9b..0aae0942ca04f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -25,7 +25,7 @@ import scala.collection.immutable import scala.language.implicitConversions import scala.reflect.runtime.universe.TypeTag -import org.apache.spark.annotation.{AlphaComponent, DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.analysis._ @@ -43,14 +43,13 @@ import org.apache.spark.util.Utils import org.apache.spark.{Partition, SparkContext} /** - * :: AlphaComponent :: * The entry point for running relational queries using Spark. Allows the creation of [[DataFrame]] * objects and the execution of SQL queries. * + * @groupname ddl_ops Catalog DDL functions * @groupname userf Spark SQL Functions * @groupname Ungrouped Support functions for language integrated queries. 
*/ -@AlphaComponent class SQLContext(@transient val sparkContext: SparkContext) extends org.apache.spark.Logging with Serializable { @@ -127,13 +126,19 @@ class SQLContext(@transient val sparkContext: SparkContext) protected[sql] val cacheManager = new CacheManager(this) /** + * :: Experimental :: * A collection of methods that are considered experimental, but can be used to hook into * the query planner for advanced functionalities. */ + @Experimental @transient val experimental: ExperimentalMethods = new ExperimentalMethods(this) - /** Returns a [[DataFrame]] with no rows or columns. */ + /** + * :: Experimental :: + * Returns a [[DataFrame]] with no rows or columns. + */ + @Experimental @transient lazy val emptyDataFrame = DataFrame(this, NoRelation) @@ -178,9 +183,11 @@ class SQLContext(@transient val sparkContext: SparkContext) // scalastyle:off // Disable style checker so "implicits" object can start with lowercase i /** - * (Scala-specific) - * Implicit methods available in Scala for converting common Scala objects into [[DataFrame]]s. + * :: Experimental :: + * (Scala-specific) Implicit methods available in Scala for converting + * common Scala objects into [[DataFrame]]s. */ + @Experimental object implicits extends Serializable { // scalastyle:on @@ -250,10 +257,12 @@ class SQLContext(@transient val sparkContext: SparkContext) } /** + * :: Experimental :: * Creates a DataFrame from an RDD of case classes. * * @group userf */ + @Experimental def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = { SparkPlan.currentContext.set(self) val schema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType] @@ -263,8 +272,10 @@ class SQLContext(@transient val sparkContext: SparkContext) } /** + * :: Experimental :: * Creates a DataFrame from a local Seq of Product. */ + @Experimental def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = { SparkPlan.currentContext.set(self) val schema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType] @@ -281,7 +292,7 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: DeveloperApi :: - * Creates a [[DataFrame]] from an [[RDD]] containing [[Row]]s by applying a schema to this RDD. + * Creates a [[DataFrame]] from an [[RDD]] containing [[Row]]s using the given schema. * It is important to make sure that the structure of every [[Row]] of the provided RDD matches * the provided schema. Otherwise, there will be runtime exception. * Example: @@ -316,6 +327,12 @@ class SQLContext(@transient val sparkContext: SparkContext) DataFrame(this, logicalPlan) } + /** + * :: DeveloperApi :: + * Creates a [[DataFrame]] from an [[JavaRDD]] containing [[Row]]s using the given schema. + * It is important to make sure that the structure of every [[Row]] of the provided RDD matches + * the provided schema. Otherwise, there will be runtime exception. 
+ */ @DeveloperApi def createDataFrame(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = { createDataFrame(rowRDD.rdd, schema) @@ -402,13 +419,11 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @group userf */ - @DeveloperApi @deprecated("use createDataFrame", "1.3.0") def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = { createDataFrame(rowRDD, schema) } - @DeveloperApi @deprecated("use createDataFrame", "1.3.0") def applySchema(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = { createDataFrame(rowRDD, schema) @@ -437,18 +452,22 @@ class SQLContext(@transient val sparkContext: SparkContext) } /** - * Loads a Parquet file, returning the result as a [[DataFrame]]. + * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty + * [[DataFrame]] if no paths are passed in. * * @group userf */ @scala.annotation.varargs - def parquetFile(path: String, paths: String*): DataFrame = - if (conf.parquetUseDataSourceApi) { - baseRelationToDataFrame(parquet.ParquetRelation2(path +: paths, Map.empty)(this)) + def parquetFile(paths: String*): DataFrame = { + if (paths.isEmpty) { + emptyDataFrame + } else if (conf.parquetUseDataSourceApi) { + baseRelationToDataFrame(parquet.ParquetRelation2(paths, Map.empty)(this)) } else { DataFrame(this, parquet.ParquetRelation( - (path +: paths).mkString(","), Some(sparkContext.hadoopConfiguration), this)) + paths.mkString(","), Some(sparkContext.hadoopConfiguration), this)) } + } /** * Loads a JSON file (one object per line), returning the result as a [[DataFrame]]. @@ -545,8 +564,7 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: - * Returns the dataset stored at path as a DataFrame, - * using the given data source. + * Returns the dataset stored at path as a DataFrame, using the given data source. */ @Experimental def load(path: String, source: String): DataFrame = { @@ -555,7 +573,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: - * Returns the dataset specified by the given data source and a set of options as a DataFrame. + * (Java-specific) Returns the dataset specified by the given data source and + * a set of options as a DataFrame. */ @Experimental def load(source: String, options: java.util.Map[String, String]): DataFrame = { @@ -564,8 +583,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: - * (Scala-specific) - * Returns the dataset specified by the given data source and a set of options as a DataFrame. + * (Scala-specific) Returns the dataset specified by the given data source and + * a set of options as a DataFrame. */ @Experimental def load(source: String, options: Map[String, String]): DataFrame = { @@ -575,8 +594,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: - * Returns the dataset specified by the given data source and a set of options as a DataFrame, - * using the given schema as the schema of the DataFrame. + * (Java-specific) Returns the dataset specified by the given data source and + * a set of options as a DataFrame, using the given schema as the schema of the DataFrame. */ @Experimental def load( @@ -588,9 +607,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: - * (Scala-specific) - * Returns the dataset specified by the given data source and a set of options as a DataFrame, - * using the given schema as the schema of the DataFrame. 
+ * (Scala-specific) Returns the dataset specified by the given data source and + * a set of options as a DataFrame, using the given schema as the schema of the DataFrame. */ @Experimental def load( @@ -605,6 +623,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * Creates an external table from the given path and returns the corresponding DataFrame. * It will use the default data source configured by spark.sql.sources.default. + * + * @group ddl_ops */ @Experimental def createExternalTable(tableName: String, path: String): DataFrame = { @@ -616,6 +636,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * Creates an external table from the given path based on a data source * and returns the corresponding DataFrame. + * + * @group ddl_ops */ @Experimental def createExternalTable( @@ -629,6 +651,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * Creates an external table from the given path based on a data source and a set of options. * Then, returns the corresponding DataFrame. + * + * @group ddl_ops */ @Experimental def createExternalTable( @@ -643,6 +667,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * (Scala-specific) * Creates an external table from the given path based on a data source and a set of options. * Then, returns the corresponding DataFrame. + * + * @group ddl_ops */ @Experimental def createExternalTable( @@ -666,6 +692,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * Create an external table from the given path based on a data source, a schema and * a set of options. Then, returns the corresponding DataFrame. + * + * @group ddl_ops */ @Experimental def createExternalTable( @@ -681,6 +709,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * (Scala-specific) * Create an external table from the given path based on a data source, a schema and * a set of options. Then, returns the corresponding DataFrame. + * + * @group ddl_ops */ @Experimental def createExternalTable( @@ -734,26 +764,23 @@ class SQLContext(@transient val sparkContext: SparkContext) * of the RDD. */ @Experimental - def jdbcRDD(url: String, table: String, theParts: Array[String]): - DataFrame = { - val parts: Array[Partition] = theParts.zipWithIndex.map( - x => JDBCPartition(x._1, x._2).asInstanceOf[Partition]) + def jdbcRDD(url: String, table: String, theParts: Array[String]): DataFrame = { + val parts: Array[Partition] = theParts.zipWithIndex.map { case (part, i) => + JDBCPartition(part, i) : Partition + } jdbcRDD(url, table, parts) } - private def jdbcRDD(url: String, table: String, parts: Array[Partition]): - DataFrame = { + private def jdbcRDD(url: String, table: String, parts: Array[Partition]): DataFrame = { val relation = JDBCRelation(url, table, parts)(this) baseRelationToDataFrame(relation) } /** - * Registers the given RDD as a temporary table in the catalog. Temporary tables exist only + * Registers the given RDD as a temporary table in the catalog. Temporary tables exist only * during the lifetime of this instance of SQLContext. - * - * @group userf */ - def registerRDDAsTable(rdd: DataFrame, tableName: String): Unit = { + private[sql] def registerRDDAsTable(rdd: DataFrame, tableName: String): Unit = { catalog.registerTable(Seq(tableName), rdd.logicalPlan) } @@ -763,7 +790,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @param tableName the name of the table to be unregistered. 
* - * @group userf + * @group ddl_ops */ def dropTempTable(tableName: String): Unit = { cacheManager.tryUncacheQuery(table(tableName)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index d8b0a3b26dbab..8051df299252c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -32,9 +32,9 @@ import org.apache.spark.sql.types.DataType /** - * Functions for registering user-defined functions. + * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this. */ -class UDFRegistration(sqlContext: SQLContext) extends Logging { +class UDFRegistration private[sql] (sqlContext: SQLContext) extends Logging { private val functionRegistry = sqlContext.functionRegistry diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 4a0ec0b72ce81..2a1e086891423 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql import scala.language.implicitConversions import scala.reflect.runtime.universe.{TypeTag, typeTag} +import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.Star import org.apache.spark.sql.catalyst.expressions._ @@ -27,8 +28,15 @@ import org.apache.spark.sql.types._ /** - * Domain specific functions available for [[DataFrame]]. + * :: Experimental :: + * Functions available for [[DataFrame]]. + * + * @groupname udf_funcs UDF functions + * @groupname agg_funcs Aggregate functions + * @groupname normal_funcs Non-aggregate functions + * @groupname Ungrouped Support functions for DataFrames. */ +@Experimental // scalastyle:off object functions { // scalastyle:on @@ -37,11 +45,15 @@ object functions { /** * Returns a [[Column]] based on the given column name. + * + * @group normal_funcs */ def col(colName: String): Column = Column(colName) /** * Returns a [[Column]] based on the given column name. Alias of [[col]]. + * + * @group normal_funcs */ def column(colName: String): Column = Column(colName) @@ -51,6 +63,8 @@ object functions { * The passed in object is returned directly if it is already a [[Column]]. * If the object is a Scala Symbol, it is converted into a [[Column]] also. * Otherwise, a new [[Column]] is created to represent the literal value. + * + * @group normal_funcs */ def lit(literal: Any): Column = { literal match { @@ -84,80 +98,168 @@ object functions { ////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////// - /** Aggregate function: returns the sum of all values in the expression. */ + /** + * Aggregate function: returns the sum of all values in the expression. + * + * @group agg_funcs + */ def sum(e: Column): Column = Sum(e.expr) - /** Aggregate function: returns the sum of all values in the given column. */ + /** + * Aggregate function: returns the sum of all values in the given column. + * + * @group agg_funcs + */ def sum(columnName: String): Column = sum(Column(columnName)) - /** Aggregate function: returns the sum of distinct values in the expression. 
*/ + /** + * Aggregate function: returns the sum of distinct values in the expression. + * + * @group agg_funcs + */ def sumDistinct(e: Column): Column = SumDistinct(e.expr) - /** Aggregate function: returns the sum of distinct values in the expression. */ + /** + * Aggregate function: returns the sum of distinct values in the expression. + * + * @group agg_funcs + */ def sumDistinct(columnName: String): Column = sumDistinct(Column(columnName)) - /** Aggregate function: returns the number of items in a group. */ + /** + * Aggregate function: returns the number of items in a group. + * + * @group agg_funcs + */ def count(e: Column): Column = e.expr match { // Turn count(*) into count(1) case s: Star => Count(Literal(1)) case _ => Count(e.expr) } - /** Aggregate function: returns the number of items in a group. */ + /** + * Aggregate function: returns the number of items in a group. + * + * @group agg_funcs + */ def count(columnName: String): Column = count(Column(columnName)) - /** Aggregate function: returns the number of distinct items in a group. */ + /** + * Aggregate function: returns the number of distinct items in a group. + * + * @group agg_funcs + */ @scala.annotation.varargs def countDistinct(expr: Column, exprs: Column*): Column = CountDistinct((expr +: exprs).map(_.expr)) - /** Aggregate function: returns the number of distinct items in a group. */ + /** + * Aggregate function: returns the number of distinct items in a group. + * + * @group agg_funcs + */ @scala.annotation.varargs def countDistinct(columnName: String, columnNames: String*): Column = countDistinct(Column(columnName), columnNames.map(Column.apply) :_*) - /** Aggregate function: returns the approximate number of distinct items in a group. */ + /** + * Aggregate function: returns the approximate number of distinct items in a group. + * + * @group agg_funcs + */ def approxCountDistinct(e: Column): Column = ApproxCountDistinct(e.expr) - /** Aggregate function: returns the approximate number of distinct items in a group. */ + /** + * Aggregate function: returns the approximate number of distinct items in a group. + * + * @group agg_funcs + */ def approxCountDistinct(columnName: String): Column = approxCountDistinct(column(columnName)) - /** Aggregate function: returns the approximate number of distinct items in a group. */ + /** + * Aggregate function: returns the approximate number of distinct items in a group. + * + * @group agg_funcs + */ def approxCountDistinct(e: Column, rsd: Double): Column = ApproxCountDistinct(e.expr, rsd) - /** Aggregate function: returns the approximate number of distinct items in a group. */ + /** + * Aggregate function: returns the approximate number of distinct items in a group. + * + * @group agg_funcs + */ def approxCountDistinct(columnName: String, rsd: Double): Column = { approxCountDistinct(Column(columnName), rsd) } - /** Aggregate function: returns the average of the values in a group. */ + /** + * Aggregate function: returns the average of the values in a group. + * + * @group agg_funcs + */ def avg(e: Column): Column = Average(e.expr) - /** Aggregate function: returns the average of the values in a group. */ + /** + * Aggregate function: returns the average of the values in a group. + * + * @group agg_funcs + */ def avg(columnName: String): Column = avg(Column(columnName)) - /** Aggregate function: returns the first value in a group. */ + /** + * Aggregate function: returns the first value in a group. 
+ * + * @group agg_funcs + */ def first(e: Column): Column = First(e.expr) - /** Aggregate function: returns the first value of a column in a group. */ + /** + * Aggregate function: returns the first value of a column in a group. + * + * @group agg_funcs + */ def first(columnName: String): Column = first(Column(columnName)) - /** Aggregate function: returns the last value in a group. */ + /** + * Aggregate function: returns the last value in a group. + * + * @group agg_funcs + */ def last(e: Column): Column = Last(e.expr) - /** Aggregate function: returns the last value of the column in a group. */ + /** + * Aggregate function: returns the last value of the column in a group. + * + * @group agg_funcs + */ def last(columnName: String): Column = last(Column(columnName)) - /** Aggregate function: returns the minimum value of the expression in a group. */ + /** + * Aggregate function: returns the minimum value of the expression in a group. + * + * @group agg_funcs + */ def min(e: Column): Column = Min(e.expr) - /** Aggregate function: returns the minimum value of the column in a group. */ + /** + * Aggregate function: returns the minimum value of the column in a group. + * + * @group agg_funcs + */ def min(columnName: String): Column = min(Column(columnName)) - /** Aggregate function: returns the maximum value of the expression in a group. */ + /** + * Aggregate function: returns the maximum value of the expression in a group. + * + * @group agg_funcs + */ def max(e: Column): Column = Max(e.expr) - /** Aggregate function: returns the maximum value of the column in a group. */ + /** + * Aggregate function: returns the maximum value of the column in a group. + * + * @group agg_funcs + */ def max(columnName: String): Column = max(Column(columnName)) ////////////////////////////////////////////////////////////////////////////////////////////// @@ -168,6 +270,8 @@ object functions { * {{{ * df.select(coalesce(df("a"), df("b"))) * }}} + * + * @group normal_funcs */ @scala.annotation.varargs def coalesce(e: Column*): Column = Coalesce(e.map(_.expr)) @@ -182,6 +286,8 @@ object functions { * // Java: * df.select( negate(df.col("amount")) ); * }}} + * + * @group normal_funcs */ def negate(e: Column): Column = -e @@ -194,19 +300,37 @@ object functions { * // Java: * df.filter( not(df.col("isActive")) ); * }} + * + * @group normal_funcs */ def not(e: Column): Column = !e - /** Converts a string expression to upper case. */ + /** + * Converts a string expression to upper case. + * + * @group normal_funcs + */ def upper(e: Column): Column = Upper(e.expr) - /** Converts a string exprsesion to lower case. */ + /** + * Converts a string exprsesion to lower case. + * + * @group normal_funcs + */ def lower(e: Column): Column = Lower(e.expr) - /** Computes the square root of the specified float value. */ + /** + * Computes the square root of the specified float value. + * + * @group normal_funcs + */ def sqrt(e: Column): Column = Sqrt(e.expr) - /** Computes the absolutle value. */ + /** + * Computes the absolutle value. + * + * @group normal_funcs + */ def abs(e: Column): Column = Abs(e.expr) ////////////////////////////////////////////////////////////////////////////////////////////// @@ -222,6 +346,8 @@ object functions { /** * Defines a user-defined function of ${x} arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. 
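// Illustrative sketch, not part of this patch: typical use of the udf(...)
// builders documented in this file. The DataFrame `df` and its "age" column
// are hypothetical, and the application syntax is assumed rather than taken
// from this diff.
import org.apache.spark.sql.functions.udf
val plusOne = udf((x: Int) => x + 1) // argument and return types inferred from the closure
// df.select(plusOne(df("age")))     // apply the UDF to a (hypothetical) column
// end of illustrative sketch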
+ * + * @group udf_funcs */ def udf[$typeTags](f: Function$x[$types]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -236,6 +362,8 @@ object functions { /** * Call a Scala function of ${x} arguments as user-defined function (UDF). This requires * you to specify the return data type. + * + * @group udf_funcs */ def callUDF(f: Function$x[$fTypes], returnType: DataType${if (args.length > 0) ", " + args else ""}): Column = { ScalaUdf(f, returnType, Seq($argsInUdf)) @@ -246,6 +374,8 @@ object functions { /** * Defines a user-defined function of 0 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. + * + * @group udf_funcs */ def udf[RT: TypeTag](f: Function0[RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -254,6 +384,8 @@ object functions { /** * Defines a user-defined function of 1 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. + * + * @group udf_funcs */ def udf[RT: TypeTag, A1: TypeTag](f: Function1[A1, RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -262,6 +394,8 @@ object functions { /** * Defines a user-defined function of 2 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. + * + * @group udf_funcs */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag](f: Function2[A1, A2, RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -270,6 +404,8 @@ object functions { /** * Defines a user-defined function of 3 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. + * + * @group udf_funcs */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag](f: Function3[A1, A2, A3, RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -278,6 +414,8 @@ object functions { /** * Defines a user-defined function of 4 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. + * + * @group udf_funcs */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag](f: Function4[A1, A2, A3, A4, RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -286,6 +424,8 @@ object functions { /** * Defines a user-defined function of 5 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. + * + * @group udf_funcs */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag](f: Function5[A1, A2, A3, A4, A5, RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -294,6 +434,8 @@ object functions { /** * Defines a user-defined function of 6 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. 
+ * + * @group udf_funcs */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag](f: Function6[A1, A2, A3, A4, A5, A6, RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -302,6 +444,8 @@ object functions { /** * Defines a user-defined function of 7 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. + * + * @group udf_funcs */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag](f: Function7[A1, A2, A3, A4, A5, A6, A7, RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -310,6 +454,8 @@ object functions { /** * Defines a user-defined function of 8 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. + * + * @group udf_funcs */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag](f: Function8[A1, A2, A3, A4, A5, A6, A7, A8, RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -318,6 +464,8 @@ object functions { /** * Defines a user-defined function of 9 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. + * + * @group udf_funcs */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag](f: Function9[A1, A2, A3, A4, A5, A6, A7, A8, A9, RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -326,6 +474,8 @@ object functions { /** * Defines a user-defined function of 10 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. + * + * @group udf_funcs */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag](f: Function10[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT]): UserDefinedFunction = { UserDefinedFunction(f, ScalaReflection.schemaFor(typeTag[RT]).dataType) @@ -336,6 +486,8 @@ object functions { /** * Call a Scala function of 0 arguments as user-defined function (UDF). This requires * you to specify the return data type. + * + * @group udf_funcs */ def callUDF(f: Function0[_], returnType: DataType): Column = { ScalaUdf(f, returnType, Seq()) @@ -344,6 +496,8 @@ object functions { /** * Call a Scala function of 1 arguments as user-defined function (UDF). This requires * you to specify the return data type. + * + * @group udf_funcs */ def callUDF(f: Function1[_, _], returnType: DataType, arg1: Column): Column = { ScalaUdf(f, returnType, Seq(arg1.expr)) @@ -352,6 +506,8 @@ object functions { /** * Call a Scala function of 2 arguments as user-defined function (UDF). This requires * you to specify the return data type. + * + * @group udf_funcs */ def callUDF(f: Function2[_, _, _], returnType: DataType, arg1: Column, arg2: Column): Column = { ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr)) @@ -360,6 +516,8 @@ object functions { /** * Call a Scala function of 3 arguments as user-defined function (UDF). This requires * you to specify the return data type. 
+ * + * @group udf_funcs */ def callUDF(f: Function3[_, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column): Column = { ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr)) @@ -368,6 +526,8 @@ object functions { /** * Call a Scala function of 4 arguments as user-defined function (UDF). This requires * you to specify the return data type. + * + * @group udf_funcs */ def callUDF(f: Function4[_, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column): Column = { ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr)) @@ -376,6 +536,8 @@ object functions { /** * Call a Scala function of 5 arguments as user-defined function (UDF). This requires * you to specify the return data type. + * + * @group udf_funcs */ def callUDF(f: Function5[_, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column): Column = { ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr)) @@ -384,6 +546,8 @@ object functions { /** * Call a Scala function of 6 arguments as user-defined function (UDF). This requires * you to specify the return data type. + * + * @group udf_funcs */ def callUDF(f: Function6[_, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column): Column = { ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr)) @@ -392,6 +556,8 @@ object functions { /** * Call a Scala function of 7 arguments as user-defined function (UDF). This requires * you to specify the return data type. + * + * @group udf_funcs */ def callUDF(f: Function7[_, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column): Column = { ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr)) @@ -400,6 +566,8 @@ object functions { /** * Call a Scala function of 8 arguments as user-defined function (UDF). This requires * you to specify the return data type. + * + * @group udf_funcs */ def callUDF(f: Function8[_, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column): Column = { ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr)) @@ -408,6 +576,8 @@ object functions { /** * Call a Scala function of 9 arguments as user-defined function (UDF). This requires * you to specify the return data type. + * + * @group udf_funcs */ def callUDF(f: Function9[_, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column): Column = { ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr)) @@ -416,6 +586,8 @@ object functions { /** * Call a Scala function of 10 arguments as user-defined function (UDF). This requires * you to specify the return data type. 
+ * + * @group udf_funcs */ def callUDF(f: Function10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType, arg1: Column, arg2: Column, arg3: Column, arg4: Column, arg5: Column, arg6: Column, arg7: Column, arg8: Column, arg9: Column, arg10: Column): Column = { ScalaUdf(f, returnType, Seq(arg1.expr, arg2.expr, arg3.expr, arg4.expr, arg5.expr, arg6.expr, arg7.expr, arg8.expr, arg9.expr, arg10.expr)) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index d2c39ab621713..e63cea60457d9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -143,7 +143,7 @@ private[hive] trait HiveStrategies { PhysicalRDD(plan.output, sparkContext.emptyRDD[Row]) :: Nil } else { hiveContext - .parquetFile(partitionLocations.head, partitionLocations.tail: _*) + .parquetFile(partitionLocations: _*) .addPartitioningAttributes(relation.partitionKeys) .lowerCase .where(unresolvedOtherPredicates) @@ -152,6 +152,7 @@ private[hive] trait HiveStrategies { .executedPlan .fakeOutput(projectList.map(_.toAttribute)) :: Nil } + } else { hiveContext .parquetFile(relation.hiveQlTable.getDataLocation.toString) From ac6fe67e1d8bf01ee565f9cc09ad48d88a275829 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 16 Feb 2015 20:32:03 -0800 Subject: [PATCH 205/357] [SPARK-5363] [PySpark] check ending mark in non-block way There is chance of dead lock that the Python process is waiting for ending mark from JVM, but which is eaten by corrupted stream. This PR checks the ending mark from Python in non-block way, so it will not blocked by Python process. There is a small chance that the ending mark is sent by Python process but not available right now, then Python process will not be used. cc JoshRosen pwendell Author: Davies Liu Closes #4601 from davies/freeze and squashes the following commits: e15a8c3 [Davies Liu] update logging 890329c [Davies Liu] Merge branch 'freeze' of github.com:davies/spark into freeze 2bd2228 [Davies Liu] add more logging 656d544 [Davies Liu] Update PythonRDD.scala 05e1085 [Davies Liu] check ending mark in non-block way --- .../apache/spark/api/python/PythonRDD.scala | 21 +++++++++++++++---- python/pyspark/worker.py | 1 + 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 252721192904f..c3c8336a437a6 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -144,11 +144,24 @@ private[spark] class PythonRDD( stream.readFully(update) accumulator += Collections.singletonList(update) } + // Check whether the worker is ready to be re-used. 
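// Illustrative sketch, not part of this patch: the essence of the fix below is
// to probe the input stream non-blockingly before trying to read the 4-byte
// end-of-stream marker, so a wedged Python worker cannot block the JVM task.
// The name markerAvailable and its parameters are placeholders for illustration.
def markerAvailable(in: java.io.DataInputStream, endOfStreamMark: Int): Boolean =
  // available() is a non-blocking estimate; only read the Int if 4 bytes exist
  in.available() >= 4 && in.readInt() == endOfStreamMark
// end of illustrative sketch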
- if (stream.readInt() == SpecialLengths.END_OF_STREAM) { - if (reuse_worker) { - env.releasePythonWorker(pythonExec, envVars.toMap, worker) - released = true + if (reuse_worker) { + // It has a high possibility that the ending mark is already available, + // And current task should not be blocked by checking it + + if (stream.available() >= 4) { + val ending = stream.readInt() + if (ending == SpecialLengths.END_OF_STREAM) { + env.releasePythonWorker(pythonExec, envVars.toMap, worker) + released = true + logInfo(s"Communication with worker ended cleanly, re-use it: $worker") + } else { + logInfo(s"Communication with worker did not end cleanly (ending with $ending), " + + s"close it: $worker") + } + } else { + logInfo(s"The ending mark from worker is not available, close it: $worker") } } null diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 8a93c320ec5d3..180bdbb4c2c4f 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -121,6 +121,7 @@ def process(): write_int(len(_accumulatorRegistry), outfile) for (aid, accum) in _accumulatorRegistry.items(): pickleSer._write_with_length((aid, accum._value), outfile) + outfile.flush() # check end of stream if read_int(infile) == SpecialLengths.END_OF_STREAM: From a51d51ffac00931c80ce93889a98c2f77aef8953 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 16 Feb 2015 20:33:33 -0800 Subject: [PATCH 206/357] SPARK-5850: Remove experimental label for Scala 2.11 and FlumePollingStream Author: Patrick Wendell Closes #4638 from pwendell/SPARK-5850 and squashes the following commits: 386126f [Patrick Wendell] SPARK-5850: Remove experimental label for Scala 2.11 and FlumePollingStream. --- docs/building-spark.md | 6 +++--- docs/streaming-flume-integration.md | 2 +- .../org/apache/spark/streaming/flume/FlumeUtils.scala | 8 -------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/docs/building-spark.md b/docs/building-spark.md index 088da7da4980e..4c3988e819ad8 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -111,9 +111,9 @@ To produce a Spark package compiled with Scala 2.11, use the `-Dscala-2.11` prop dev/change-version-to-2.11.sh mvn -Pyarn -Phadoop-2.4 -Dscala-2.11 -DskipTests clean package -Scala 2.11 support in Spark is experimental and does not support a few features. -Specifically, Spark's external Kafka library and JDBC component are not yet -supported in Scala 2.11 builds. +Scala 2.11 support in Spark does not support a few features due to dependencies +which are themselves not Scala 2.11 ready. Specifically, Spark's external +Kafka library and JDBC component are not yet supported in Scala 2.11 builds. # Spark Tests in Maven diff --git a/docs/streaming-flume-integration.md b/docs/streaming-flume-integration.md index ac01dd3d8019a..40e17246fea83 100644 --- a/docs/streaming-flume-integration.md +++ b/docs/streaming-flume-integration.md @@ -64,7 +64,7 @@ configuring Flume agents. 3. **Deploying:** Package `spark-streaming-flume_{{site.SCALA_BINARY_VERSION}}` and its dependencies (except `spark-core_{{site.SCALA_BINARY_VERSION}}` and `spark-streaming_{{site.SCALA_BINARY_VERSION}}` which are provided by `spark-submit`) into the application JAR. Then use `spark-submit` to launch your application (see [Deploying section](streaming-programming-guide.html#deploying-applications) in the main programming guide). 
-## Approach 2 (Experimental): Pull-based Approach using a Custom Sink +## Approach 2: Pull-based Approach using a Custom Sink Instead of Flume pushing data directly to Spark Streaming, this approach runs a custom Flume sink that allows the following. - Flume pushes data into the sink, and the data stays buffered. diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala index 4b732c1592ab2..44dec45c227ca 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala @@ -19,7 +19,6 @@ package org.apache.spark.streaming.flume import java.net.InetSocketAddress -import org.apache.spark.annotation.Experimental import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext} @@ -121,7 +120,6 @@ object FlumeUtils { * @param port Port of the host at which the Spark Sink is listening * @param storageLevel Storage level to use for storing the received objects */ - @Experimental def createPollingStream( ssc: StreamingContext, hostname: String, @@ -138,7 +136,6 @@ object FlumeUtils { * @param addresses List of InetSocketAddresses representing the hosts to connect to. * @param storageLevel Storage level to use for storing the received objects */ - @Experimental def createPollingStream( ssc: StreamingContext, addresses: Seq[InetSocketAddress], @@ -159,7 +156,6 @@ object FlumeUtils { * result in this stream using more threads * @param storageLevel Storage level to use for storing the received objects */ - @Experimental def createPollingStream( ssc: StreamingContext, addresses: Seq[InetSocketAddress], @@ -178,7 +174,6 @@ object FlumeUtils { * @param hostname Hostname of the host on which the Spark Sink is running * @param port Port of the host at which the Spark Sink is listening */ - @Experimental def createPollingStream( jssc: JavaStreamingContext, hostname: String, @@ -195,7 +190,6 @@ object FlumeUtils { * @param port Port of the host at which the Spark Sink is listening * @param storageLevel Storage level to use for storing the received objects */ - @Experimental def createPollingStream( jssc: JavaStreamingContext, hostname: String, @@ -212,7 +206,6 @@ object FlumeUtils { * @param addresses List of InetSocketAddresses on which the Spark Sink is running. * @param storageLevel Storage level to use for storing the received objects */ - @Experimental def createPollingStream( jssc: JavaStreamingContext, addresses: Array[InetSocketAddress], @@ -233,7 +226,6 @@ object FlumeUtils { * result in this stream using more threads * @param storageLevel Storage level to use for storing the received objects */ - @Experimental def createPollingStream( jssc: JavaStreamingContext, addresses: Array[InetSocketAddress], From d380f324c6d38ffacfda83a525a1a7e23347e5b8 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Mon, 16 Feb 2015 20:42:57 -0800 Subject: [PATCH 207/357] [SPARK-5853][SQL] Schema support in Row. Author: Reynold Xin Closes #4640 from rxin/SPARK-5853 and squashes the following commits: 9c6f569 [Reynold Xin] [SPARK-5853][SQL] Schema support in Row. 
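To illustrate what this change enables (a sketch only, not part of the patch; it assumes a SQLContext and a DataFrame `df` comparable to the `testData` fixture exercised in the new DataFrameSuite test below):

    // Rows materialized via head()/take() now carry the originating schema,
    // because convertRowToScala wraps them in GenericRowWithSchema.
    val row = df.head(2).head
    assert(row.schema == df.schema)
    val columnNames = row.schema.fields.map(_.name)   // e.g. Array("key", "value")

    // A hand-built GenericRow still reports null here, since Row.schema
    // only gains a null default in this patch.

The null default means code that constructs raw GenericRows is unaffected, while rows collected from a DataFrame now expose their column metadata.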
--- sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala | 7 ++++++- .../org/apache/spark/sql/catalyst/ScalaReflection.scala | 6 +++--- .../org/apache/spark/sql/catalyst/expressions/rows.scala | 6 +++++- .../test/scala/org/apache/spark/sql/DataFrameSuite.scala | 6 ++++++ 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index 3a70d25534968..d794f034f5578 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import scala.util.hashing.MurmurHash3 import org.apache.spark.sql.catalyst.expressions.GenericRow -import org.apache.spark.sql.types.DateUtils +import org.apache.spark.sql.types.{StructType, DateUtils} object Row { /** @@ -122,6 +122,11 @@ trait Row extends Serializable { /** Number of elements in the Row. */ def length: Int + /** + * Schema for the row. + */ + def schema: StructType = null + /** * Returns the value at position i. If the value is null, null is returned. The following * is a mapping between Spark SQL types and return types: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 11fd443733658..d6126c24fc50d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst import java.sql.Timestamp import org.apache.spark.util.Utils -import org.apache.spark.sql.catalyst.expressions.{GenericRow, Attribute, AttributeReference, Row} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ @@ -91,9 +91,9 @@ trait ScalaReflection { def convertRowToScala(r: Row, schema: StructType): Row = { // TODO: This is very slow!!! - new GenericRow( + new GenericRowWithSchema( r.toSeq.zip(schema.fields.map(_.dataType)) - .map(r_dt => convertToScala(r_dt._1, r_dt._2)).toArray) + .map(r_dt => convertToScala(r_dt._1, r_dt._2)).toArray, schema) } /** Returns a Sequence of attributes for the given case class type. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala index 73ec7a6d114f5..faa366771824b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/rows.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.types.NativeType +import org.apache.spark.sql.types.{StructType, NativeType} /** @@ -149,6 +149,10 @@ class GenericRow(protected[sql] val values: Array[Any]) extends Row { def copy() = this } +class GenericRowWithSchema(values: Array[Any], override val schema: StructType) + extends GenericRow(values) { +} + class GenericMutableRow(v: Array[Any]) extends GenericRow(v) with MutableRow { /** No-arg constructor for serialization. 
*/ def this() = this(null) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 524571d9cc636..0da619def1d09 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -89,6 +89,12 @@ class DataFrameSuite extends QueryTest { testData.collect().toSeq) } + test("head and take") { + assert(testData.take(2) === testData.collect().take(2)) + assert(testData.head(2) === testData.collect().take(2)) + assert(testData.head(2).head.schema === testData.schema) + } + test("self join") { val df1 = testData.select(testData("key")).as('df1) val df2 = testData.select(testData("key")).as('df2) From fd84229e2aeb6a03760703c9dccd2db853779400 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 16 Feb 2015 22:09:04 -0800 Subject: [PATCH 208/357] [SPARK-5802][MLLIB] cache transformed data in glm If we need to transform the input data, we should cache the output to avoid re-computing feature vectors every iteration. dbtsai Author: Xiangrui Meng Closes #4593 from mengxr/SPARK-5802 and squashes the following commits: ae3be84 [Xiangrui Meng] cache transformed data in glm --- .../GeneralizedLinearAlgorithm.scala | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 17de215b97f9d..2b7145362ab0b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -205,7 +205,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] throw new SparkException("Input validation failed.") } - /** + /* * Scaling columns to unit variance as a heuristic to reduce the condition number: * * During the optimization process, the convergence (rate) depends on the condition number of @@ -225,26 +225,27 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] * Currently, it's only enabled in LogisticRegressionWithLBFGS */ val scaler = if (useFeatureScaling) { - (new StandardScaler(withStd = true, withMean = false)).fit(input.map(x => x.features)) + new StandardScaler(withStd = true, withMean = false).fit(input.map(_.features)) } else { null } // Prepend an extra variable consisting of all 1.0's for the intercept. - val data = if (addIntercept) { - if (useFeatureScaling) { - input.map(labeledPoint => - (labeledPoint.label, appendBias(scaler.transform(labeledPoint.features)))) - } else { - input.map(labeledPoint => (labeledPoint.label, appendBias(labeledPoint.features))) - } - } else { - if (useFeatureScaling) { - input.map(labeledPoint => (labeledPoint.label, scaler.transform(labeledPoint.features))) + // TODO: Apply feature scaling to the weight vector instead of input data. 
+ val data = + if (addIntercept) { + if (useFeatureScaling) { + input.map(lp => (lp.label, appendBias(scaler.transform(lp.features)))).cache() + } else { + input.map(lp => (lp.label, appendBias(lp.features))).cache() + } } else { - input.map(labeledPoint => (labeledPoint.label, labeledPoint.features)) + if (useFeatureScaling) { + input.map(lp => (lp.label, scaler.transform(lp.features))).cache() + } else { + input.map(lp => (lp.label, lp.features)) + } } - } /** * TODO: For better convergence, in logistic regression, the intercepts should be computed From c06e42f2c1e5fcf123b466efd27ee4cb53bbed3f Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 16 Feb 2015 22:10:39 -0800 Subject: [PATCH 209/357] HOTFIX: Style issue causing build break Caused by #4601 --- .../main/scala/org/apache/spark/api/python/PythonRDD.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index c3c8336a437a6..e94c390df8224 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -157,8 +157,8 @@ private[spark] class PythonRDD( released = true logInfo(s"Communication with worker ended cleanly, re-use it: $worker") } else { - logInfo(s"Communication with worker did not end cleanly (ending with $ending), " + - s"close it: $worker") + logInfo(s"Communication with worker did not end cleanly " + + s"(ending with $ending), close it: $worker") } } else { logInfo(s"The ending mark from worker is not available, close it: $worker") From a65766bf0244a41b793b9dc5fbdd2882664ad00e Mon Sep 17 00:00:00 2001 From: jerryshao Date: Tue, 17 Feb 2015 10:45:18 +0000 Subject: [PATCH 210/357] [SPARK-5826][Streaming] Fix Configuration not serializable problem Author: jerryshao Closes #4612 from jerryshao/SPARK-5826 and squashes the following commits: 7ec71db [jerryshao] Remove transient for conf statement 88d84e6 [jerryshao] Fix Configuration not serializable problem --- .../apache/spark/streaming/dstream/FileInputDStream.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala index 6379b88527ec8..4f7db41abe76f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala @@ -18,7 +18,6 @@ package org.apache.spark.streaming.dstream import java.io.{IOException, ObjectInputStream} -import java.util.concurrent.ConcurrentHashMap import scala.collection.mutable import scala.reflect.ClassTag @@ -27,6 +26,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path, PathFilter} import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat} +import org.apache.spark.SerializableWritable import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming._ import org.apache.spark.util.{TimeStampedHashMap, Utils} @@ -78,6 +78,8 @@ class FileInputDStream[K, V, F <: NewInputFormat[K,V]]( (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]) extends InputDStream[(K, V)](ssc_) { + private val serializableConfOpt = conf.map(new SerializableWritable(_)) + // This is a def so that it works during checkpoint recovery: private def clock = 
ssc.scheduler.clock @@ -240,7 +242,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K,V]]( /** Generate one RDD from an array of files */ private def filesToRDD(files: Seq[String]): RDD[(K, V)] = { val fileRDDs = files.map(file =>{ - val rdd = conf match { + val rdd = serializableConfOpt.map(_.value) match { case Some(config) => context.sparkContext.newAPIHadoopFile( file, fm.runtimeClass.asInstanceOf[Class[F]], From ee6e3eff02e9e08b1113ba6faf3397d7e7775087 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 17 Feb 2015 07:48:27 -0800 Subject: [PATCH 211/357] Revert "[SPARK-5363] [PySpark] check ending mark in non-block way" This reverts commits ac6fe67e1d8bf01ee565f9cc09ad48d88a275829 and c06e42f2c1e5fcf123b466efd27ee4cb53bbed3f. --- .../apache/spark/api/python/PythonRDD.scala | 21 ++++--------------- python/pyspark/worker.py | 1 - 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index e94c390df8224..252721192904f 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -144,24 +144,11 @@ private[spark] class PythonRDD( stream.readFully(update) accumulator += Collections.singletonList(update) } - // Check whether the worker is ready to be re-used. - if (reuse_worker) { - // It has a high possibility that the ending mark is already available, - // And current task should not be blocked by checking it - - if (stream.available() >= 4) { - val ending = stream.readInt() - if (ending == SpecialLengths.END_OF_STREAM) { - env.releasePythonWorker(pythonExec, envVars.toMap, worker) - released = true - logInfo(s"Communication with worker ended cleanly, re-use it: $worker") - } else { - logInfo(s"Communication with worker did not end cleanly " + - s"(ending with $ending), close it: $worker") - } - } else { - logInfo(s"The ending mark from worker is not available, close it: $worker") + if (stream.readInt() == SpecialLengths.END_OF_STREAM) { + if (reuse_worker) { + env.releasePythonWorker(pythonExec, envVars.toMap, worker) + released = true } } null diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 180bdbb4c2c4f..8a93c320ec5d3 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -121,7 +121,6 @@ def process(): write_int(len(_accumulatorRegistry), outfile) for (aid, accum) in _accumulatorRegistry.items(): pickleSer._write_with_length((aid, accum._value), outfile) - outfile.flush() # check end of stream if read_int(infile) == SpecialLengths.END_OF_STREAM: From 3ce46e94fe77d15f18e916b76b37fa96356ace93 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 17 Feb 2015 10:10:01 -0800 Subject: [PATCH 212/357] SPARK-5856: In Maven build script, launch Zinc with more memory I've seen out of memory exceptions when trying to run many parallel builds against the same Zinc server during packaging. We should use the same increased memory settings we use for Maven itself. I tested this and confirmed that the Nailgun JVM launched with higher memory. Author: Patrick Wendell Closes #4643 from pwendell/zinc-memory and squashes the following commits: 717cfb0 [Patrick Wendell] SPARK-5856: Launch Zinc with larger memory options. 
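A usage note (illustrative, not part of the patch): the hunk below sets the variable with `${ZINC_OPTS:-"$_COMPILE_JVM_OPTS"}`, so the larger defaults apply only when `ZINC_OPTS` is unset; anyone who needs still more headroom can export their own value before building, e.g. `ZINC_OPTS="-Xmx4g -XX:MaxPermSize=512M" build/mvn -DskipTests clean package` (the exact heap size here is just an example).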
--- build/mvn | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/build/mvn b/build/mvn index 53babf54debb6..3561110a4c019 100755 --- a/build/mvn +++ b/build/mvn @@ -21,6 +21,8 @@ _DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # Preserve the calling directory _CALLING_DIR="$(pwd)" +# Options used during compilation +_COMPILE_JVM_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" # Installs any application tarball given a URL, the expected tarball name, # and, optionally, a checkable binary path to determine if the binary has @@ -136,6 +138,7 @@ cd "${_CALLING_DIR}" # Now that zinc is ensured to be installed, check its status and, if its # not running or just installed, start it if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`${ZINC_BIN} -status`" ]; then + export ZINC_OPTS=${ZINC_OPTS:-"$_COMPILE_JVM_OPTS"} ${ZINC_BIN} -shutdown ${ZINC_BIN} -start -port ${ZINC_PORT} \ -scala-compiler "${SCALA_COMPILER}" \ @@ -143,7 +146,7 @@ if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`${ZINC_BIN} -status`" ]; then fi # Set any `mvn` options if not already present -export MAVEN_OPTS=${MAVEN_OPTS:-"-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"} +export MAVEN_OPTS=${MAVEN_OPTS:-"$_COMPILE_JVM_OPTS"} # Last, call the `mvn` command as usual ${MVN_BIN} "$@" From c76da36c2163276b5c34e59fbb139eeb34ed0faa Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 17 Feb 2015 10:17:45 -0800 Subject: [PATCH 213/357] [SPARK-5858][MLLIB] Remove unnecessary first() call in GLM `numFeatures` is only used by multinomial logistic regression. Calling `.first()` for every GLM causes performance regression, especially in Python. Author: Xiangrui Meng Closes #4647 from mengxr/SPARK-5858 and squashes the following commits: 036dc7f [Xiangrui Meng] remove unnecessary first() call 12c5548 [Xiangrui Meng] check numFeatures only once --- .../spark/mllib/classification/LogisticRegression.scala | 6 +++++- .../mllib/regression/GeneralizedLinearAlgorithm.scala | 7 ++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index 420d6e2861934..b787667b018e6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -355,6 +355,10 @@ class LogisticRegressionWithLBFGS } override protected def createModel(weights: Vector, intercept: Double) = { - new LogisticRegressionModel(weights, intercept, numFeatures, numOfLinearPredictor + 1) + if (numOfLinearPredictor == 1) { + new LogisticRegressionModel(weights, intercept) + } else { + new LogisticRegressionModel(weights, intercept, numFeatures, numOfLinearPredictor + 1) + } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 2b7145362ab0b..7c66e8cdebdbe 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -126,7 +126,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] /** * The dimension of training features. 
*/ - protected var numFeatures: Int = 0 + protected var numFeatures: Int = -1 /** * Set if the algorithm should use feature scaling to improve the convergence during optimization. @@ -163,7 +163,9 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] * RDD of LabeledPoint entries. */ def run(input: RDD[LabeledPoint]): M = { - numFeatures = input.first().features.size + if (numFeatures < 0) { + numFeatures = input.map(_.features.size).first() + } /** * When `numOfLinearPredictor > 1`, the intercepts are encapsulated into weights, @@ -193,7 +195,6 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] * of LabeledPoint entries starting from the initial weights provided. */ def run(input: RDD[LabeledPoint], initialWeights: Vector): M = { - numFeatures = input.first().features.size if (input.getStorageLevel == StorageLevel.NONE) { logWarning("The input data is not directly cached, which may hurt performance if its" From c74b07fa94a8da50437d952ae05cf6ac70fbb93e Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 17 Feb 2015 10:21:17 -0800 Subject: [PATCH 214/357] [SPARK-5166][SPARK-5247][SPARK-5258][SQL] API Cleanup / Documentation Author: Michael Armbrust Closes #4642 from marmbrus/docs and squashes the following commits: d291c34 [Michael Armbrust] python tests 9be66e3 [Michael Armbrust] comments d56afc2 [Michael Armbrust] fix style f004747 [Michael Armbrust] fix build c4a907b [Michael Armbrust] fix tests 42e2b73 [Michael Armbrust] [SQL] Documentation / API Clean-up. --- project/SparkBuild.scala | 12 +- python/pyspark/sql/context.py | 28 +-- .../org/apache/spark/sql/jdbc/JDBCUtils.java | 59 ------ .../org/apache/spark/sql/DataFrame.scala | 153 +++++++++++++- .../org/apache/spark/sql/DataFrameImpl.scala | 33 ++- .../spark/sql/ExperimentalMethods.scala | 5 + .../apache/spark/sql/IncomputableColumn.scala | 4 + .../org/apache/spark/sql/SQLContext.scala | 200 ++++++++++++++---- .../spark/sql/UserDefinedFunction.scala | 3 +- .../org/apache/spark/sql/api/package.scala | 23 ++ .../apache/spark/sql/execution/commands.scala | 2 +- .../spark/sql/execution/debug/package.scala | 10 +- .../org/apache/spark/sql/jdbc/jdbc.scala | 74 ++----- .../sql/parquet/ParquetTableOperations.scala | 4 +- .../spark/sql/parquet/ParquetTest.scala | 4 +- .../apache/spark/sql/parquet/newParquet.scala | 6 +- .../sql/parquet/timestamp/NanoTime.scala | 2 +- .../org/apache/spark/sql/sources/ddl.scala | 4 +- .../apache/spark/sql/jdbc/JavaJDBCTest.java | 102 --------- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 7 +- .../spark/sql/jdbc/JDBCWriteSuite.scala | 20 +- .../spark/sql/jdbc/MySQLIntegration.scala | 14 +- .../spark/sql/jdbc/PostgresIntegration.scala | 4 +- .../execution/DescribeHiveTableCommand.scala | 4 +- .../spark/sql/hive/execution/commands.scala | 8 + .../spark/sql/hive/execution/package.scala} | 19 +- .../org/apache/spark/sql/hive/package.scala | 10 + .../sql/hive/parquet/FakeParquetSerDe.scala | 56 ----- .../org/apache/spark/sql/hive/Shim12.scala | 9 +- .../org/apache/spark/sql/hive/Shim13.scala | 9 +- 30 files changed, 483 insertions(+), 405 deletions(-) delete mode 100644 sql/core/src/main/java/org/apache/spark/sql/jdbc/JDBCUtils.java create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/api/package.scala delete mode 100644 sql/core/src/test/java/org/apache/spark/sql/jdbc/JavaJDBCTest.java rename sql/{core/src/main/scala/org/apache/spark/sql/jdbc/JavaJDBCTrampoline.scala => hive/src/main/scala/org/apache/spark/sql/hive/execution/package.scala} (66%) delete 
mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/parquet/FakeParquetSerDe.scala diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 8fb1239b4a96b..e4b1b96527fbd 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -361,9 +361,16 @@ object Unidoc { publish := {}, unidocProjectFilter in(ScalaUnidoc, unidoc) := - inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, catalyst, streamingFlumeSink, yarn), + inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, yarn), unidocProjectFilter in(JavaUnidoc, unidoc) := - inAnyProject -- inProjects(OldDeps.project, repl, bagel, examples, tools, catalyst, streamingFlumeSink, yarn), + inAnyProject -- inProjects(OldDeps.project, repl, bagel, examples, tools, streamingFlumeSink, yarn), + + // Skip actual catalyst, but include the subproject. + // Catalyst is not public API and contains quasiquotes which break scaladoc. + unidocAllSources in (ScalaUnidoc, unidoc) := { + (unidocAllSources in (ScalaUnidoc, unidoc)).value + .map(_.filterNot(_.getCanonicalPath.contains("sql/catalyst"))) + }, // Skip class names containing $ and some internal packages in Javadocs unidocAllSources in (JavaUnidoc, unidoc) := { @@ -376,6 +383,7 @@ object Unidoc { .map(_.filterNot(_.getCanonicalPath.contains("executor"))) .map(_.filterNot(_.getCanonicalPath.contains("python"))) .map(_.filterNot(_.getCanonicalPath.contains("collection"))) + .map(_.filterNot(_.getCanonicalPath.contains("sql/catalyst"))) }, // Javadoc options: create a window title, and group key packages on index page diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index dd2cd5ee76f60..2e2309f10375d 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -252,7 +252,7 @@ def applySchema(self, rdd, schema): >>> schema = StructType([StructField("field1", IntegerType(), False), ... StructField("field2", StringType(), False)]) >>> df = sqlCtx.applySchema(rdd2, schema) - >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> sqlCtx.registerDataFrameAsTable(df, "table1") >>> df2 = sqlCtx.sql("SELECT * from table1") >>> df2.collect() [Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')] @@ -405,17 +405,17 @@ def createDataFrame(self, data, schema=None, samplingRatio=None): return self.applySchema(data, schema) - def registerRDDAsTable(self, rdd, tableName): + def registerDataFrameAsTable(self, rdd, tableName): """Registers the given RDD as a temporary table in the catalog. Temporary tables exist only during the lifetime of this instance of SQLContext. - >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> sqlCtx.registerDataFrameAsTable(df, "table1") """ if (rdd.__class__ is DataFrame): df = rdd._jdf - self._ssql_ctx.registerRDDAsTable(df, tableName) + self._ssql_ctx.registerDataFrameAsTable(df, tableName) else: raise ValueError("Can only register DataFrame as table") @@ -456,7 +456,7 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0): ... print>>ofn, json >>> ofn.close() >>> df1 = sqlCtx.jsonFile(jsonFile) - >>> sqlCtx.registerRDDAsTable(df1, "table1") + >>> sqlCtx.registerDataFrameAsTable(df1, "table1") >>> df2 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " ... 
"field6 as f4 from table1") @@ -467,7 +467,7 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0): Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) >>> df3 = sqlCtx.jsonFile(jsonFile, df1.schema) - >>> sqlCtx.registerRDDAsTable(df3, "table2") + >>> sqlCtx.registerDataFrameAsTable(df3, "table2") >>> df4 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " ... "field6 as f4 from table2") @@ -485,7 +485,7 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0): ... StructField("field5", ... ArrayType(IntegerType(), False), True)]), False)]) >>> df5 = sqlCtx.jsonFile(jsonFile, schema) - >>> sqlCtx.registerRDDAsTable(df5, "table3") + >>> sqlCtx.registerDataFrameAsTable(df5, "table3") >>> df6 = sqlCtx.sql( ... "SELECT field2 AS f1, field3.field5 as f2, " ... "field3.field5[0] as f3 from table3") @@ -509,7 +509,7 @@ def jsonRDD(self, rdd, schema=None, samplingRatio=1.0): determine the schema. >>> df1 = sqlCtx.jsonRDD(json) - >>> sqlCtx.registerRDDAsTable(df1, "table1") + >>> sqlCtx.registerDataFrameAsTable(df1, "table1") >>> df2 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " ... "field6 as f4 from table1") @@ -520,7 +520,7 @@ def jsonRDD(self, rdd, schema=None, samplingRatio=1.0): Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) >>> df3 = sqlCtx.jsonRDD(json, df1.schema) - >>> sqlCtx.registerRDDAsTable(df3, "table2") + >>> sqlCtx.registerDataFrameAsTable(df3, "table2") >>> df4 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " ... "field6 as f4 from table2") @@ -538,7 +538,7 @@ def jsonRDD(self, rdd, schema=None, samplingRatio=1.0): ... StructField("field5", ... ArrayType(IntegerType(), False), True)]), False)]) >>> df5 = sqlCtx.jsonRDD(json, schema) - >>> sqlCtx.registerRDDAsTable(df5, "table3") + >>> sqlCtx.registerDataFrameAsTable(df5, "table3") >>> df6 = sqlCtx.sql( ... "SELECT field2 AS f1, field3.field5 as f2, " ... "field3.field5[0] as f3 from table3") @@ -628,7 +628,7 @@ def createExternalTable(self, tableName, path=None, source=None, def sql(self, sqlQuery): """Return a L{DataFrame} representing the result of the given query. - >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> sqlCtx.registerDataFrameAsTable(df, "table1") >>> df2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1") >>> df2.collect() [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')] @@ -638,7 +638,7 @@ def sql(self, sqlQuery): def table(self, tableName): """Returns the specified table as a L{DataFrame}. - >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> sqlCtx.registerDataFrameAsTable(df, "table1") >>> df2 = sqlCtx.table("table1") >>> sorted(df.collect()) == sorted(df2.collect()) True @@ -653,7 +653,7 @@ def tables(self, dbName=None): The returned DataFrame has two columns, tableName and isTemporary (a column with BooleanType indicating if a table is a temporary one or not). - >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> sqlCtx.registerDataFrameAsTable(df, "table1") >>> df2 = sqlCtx.tables() >>> df2.filter("tableName = 'table1'").first() Row(tableName=u'table1', isTemporary=True) @@ -668,7 +668,7 @@ def tableNames(self, dbName=None): If `dbName` is not specified, the current database will be used. 
- >>> sqlCtx.registerRDDAsTable(df, "table1") + >>> sqlCtx.registerDataFrameAsTable(df, "table1") >>> "table1" in sqlCtx.tableNames() True >>> "table1" in sqlCtx.tableNames("db") diff --git a/sql/core/src/main/java/org/apache/spark/sql/jdbc/JDBCUtils.java b/sql/core/src/main/java/org/apache/spark/sql/jdbc/JDBCUtils.java deleted file mode 100644 index aa441b2096f18..0000000000000 --- a/sql/core/src/main/java/org/apache/spark/sql/jdbc/JDBCUtils.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.jdbc; - -import org.apache.spark.Partition; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.DataFrame; - -public class JDBCUtils { - /** - * Construct a DataFrame representing the JDBC table at the database - * specified by url with table name table. - */ - public static DataFrame jdbcRDD(SQLContext sql, String url, String table) { - Partition[] parts = new Partition[1]; - parts[0] = new JDBCPartition(null, 0); - return sql.baseRelationToDataFrame( - new JDBCRelation(url, table, parts, sql)); - } - - /** - * Construct a DataFrame representing the JDBC table at the database - * specified by url with table name table partitioned by parts. - * Here, parts is an array of expressions suitable for insertion into a WHERE - * clause; each one defines one partition. 
- */ - public static DataFrame jdbcRDD(SQLContext sql, String url, String table, String[] parts) { - Partition[] partitions = new Partition[parts.length]; - for (int i = 0; i < parts.length; i++) - partitions[i] = new JDBCPartition(parts[i], i); - return sql.baseRelationToDataFrame( - new JDBCRelation(url, table, partitions, sql)); - } - - private static JavaJDBCTrampoline trampoline = new JavaJDBCTrampoline(); - - public static void createJDBCTable(DataFrame rdd, String url, String table, boolean allowExisting) { - trampoline.createJDBCTable(rdd, url, table, allowExisting); - } - - public static void insertIntoJDBC(DataFrame rdd, String url, String table, boolean overwrite) { - trampoline.insertIntoJDBC(rdd, url, table, overwrite); - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index c0c3cb40cf1da..fa5fe84263ece 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql +import java.sql.DriverManager + + import scala.collection.JavaConversions._ import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag @@ -27,6 +30,7 @@ import org.apache.spark.api.java.JavaRDD import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.jdbc.JDBCWriteDetails import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -77,6 +81,12 @@ private[sql] object DataFrame { * .groupBy(department("name"), "gender") * .agg(avg(people("salary")), max(people("age"))) * }}} + * + * @groupname basic Basic DataFrame functions + * @groupname dfops Language Integrated Queries + * @groupname rdd RDD Operations + * @groupname output Output Operations + * @groupname action Actions */ // TODO: Improve documentation. @Experimental @@ -102,7 +112,8 @@ trait DataFrame extends RDDApi[Row] with Serializable { def toSchemaRDD: DataFrame = this /** - * Returns the object itself. Used to force an implicit conversion from RDD to DataFrame in Scala. + * Returns the object itself. + * @group basic */ // This is declared with parentheses to prevent the Scala compiler from treating // `rdd.toDF("1")` as invoking this toDF and then apply on the returned DataFrame. @@ -116,31 +127,51 @@ trait DataFrame extends RDDApi[Row] with Serializable { * rdd.toDF // this implicit conversion creates a DataFrame with column name _1 and _2 * rdd.toDF("id", "name") // this creates a DataFrame with column name "id" and "name" * }}} + * @group basic */ @scala.annotation.varargs def toDF(colNames: String*): DataFrame - /** Returns the schema of this [[DataFrame]]. */ + /** + * Returns the schema of this [[DataFrame]]. + * @group basic + */ def schema: StructType - /** Returns all column names and their data types as an array. */ + /** + * Returns all column names and their data types as an array. + * @group basic + */ def dtypes: Array[(String, String)] - /** Returns all column names as an array. */ + /** + * Returns all column names as an array. + * @group basic + */ def columns: Array[String] = schema.fields.map(_.name) - /** Prints the schema to the console in a nice tree format. */ + /** + * Prints the schema to the console in a nice tree format. + * @group basic + */ def printSchema(): Unit - /** Prints the plans (logical and physical) to the console for debugging purpose. 
*/ + /** + * Prints the plans (logical and physical) to the console for debugging purpose. + * @group basic + */ def explain(extended: Boolean): Unit - /** Only prints the physical plan to the console for debugging purpose. */ + /** + * Only prints the physical plan to the console for debugging purpose. + * @group basic + */ def explain(): Unit = explain(extended = false) /** * Returns true if the `collect` and `take` methods can be run locally * (without any Spark executors). + * @group basic */ def isLocal: Boolean @@ -154,6 +185,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * 1983 03 0.410516 0.442194 * 1984 04 0.450090 0.483521 * }}} + * @group basic */ def show(): Unit @@ -163,6 +195,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * Note that cartesian joins are very expensive without an extra filter that can be pushed down. * * @param right Right side of the join operation. + * @group dfops */ def join(right: DataFrame): DataFrame @@ -174,6 +207,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * df1.join(df2, $"df1Key" === $"df2Key") * df1.join(df2).where($"df1Key" === $"df2Key") * }}} + * @group dfops */ def join(right: DataFrame, joinExprs: Column): DataFrame @@ -194,6 +228,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @param right Right side of the join. * @param joinExprs Join expression. * @param joinType One of: `inner`, `outer`, `left_outer`, `right_outer`, `semijoin`. + * @group dfops */ def join(right: DataFrame, joinExprs: Column, joinType: String): DataFrame @@ -205,6 +240,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * df.sort($"sortcol") * df.sort($"sortcol".asc) * }}} + * @group dfops */ @scala.annotation.varargs def sort(sortCol: String, sortCols: String*): DataFrame @@ -214,6 +250,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * {{{ * df.sort($"col1", $"col2".desc) * }}} + * @group dfops */ @scala.annotation.varargs def sort(sortExprs: Column*): DataFrame @@ -221,6 +258,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { /** * Returns a new [[DataFrame]] sorted by the given expressions. * This is an alias of the `sort` function. + * @group dfops */ @scala.annotation.varargs def orderBy(sortCol: String, sortCols: String*): DataFrame @@ -228,27 +266,32 @@ trait DataFrame extends RDDApi[Row] with Serializable { /** * Returns a new [[DataFrame]] sorted by the given expressions. * This is an alias of the `sort` function. + * @group dfops */ @scala.annotation.varargs def orderBy(sortExprs: Column*): DataFrame /** * Selects column based on the column name and return it as a [[Column]]. + * @group dfops */ def apply(colName: String): Column = col(colName) /** * Selects column based on the column name and return it as a [[Column]]. + * @group dfops */ def col(colName: String): Column /** * Returns a new [[DataFrame]] with an alias set. + * @group dfops */ def as(alias: String): DataFrame /** * (Scala-specific) Returns a new [[DataFrame]] with an alias set. 
+ * @group dfops */ def as(alias: Symbol): DataFrame @@ -257,6 +300,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * {{{ * df.select($"colA", $"colB" + 1) * }}} + * @group dfops */ @scala.annotation.varargs def select(cols: Column*): DataFrame @@ -270,6 +314,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * df.select("colA", "colB") * df.select($"colA", $"colB") * }}} + * @group dfops */ @scala.annotation.varargs def select(col: String, cols: String*): DataFrame @@ -281,6 +326,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * {{{ * df.selectExpr("colA", "colB as newName", "abs(colC)") * }}} + * @group dfops */ @scala.annotation.varargs def selectExpr(exprs: String*): DataFrame @@ -293,6 +339,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * peopleDf.where($"age" > 15) * peopleDf($"age" > 15) * }}} + * @group dfops */ def filter(condition: Column): DataFrame @@ -301,6 +348,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * {{{ * peopleDf.filter("age > 15") * }}} + * @group dfops */ def filter(conditionExpr: String): DataFrame @@ -312,6 +360,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * peopleDf.where($"age" > 15) * peopleDf($"age" > 15) * }}} + * @group dfops */ def where(condition: Column): DataFrame @@ -329,6 +378,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * "age" -> "max" * )) * }}} + * @group dfops */ @scala.annotation.varargs def groupBy(cols: Column*): GroupedData @@ -350,6 +400,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * "age" -> "max" * )) * }}} + * @group dfops */ @scala.annotation.varargs def groupBy(col1: String, cols: String*): GroupedData @@ -366,6 +417,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * "expense" -> "sum" * ) * }}} + * @group dfops */ def agg(aggExpr: (String, String), aggExprs: (String, String)*): DataFrame = { groupBy().agg(aggExpr, aggExprs :_*) @@ -378,6 +430,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * df.agg(Map("age" -> "max", "salary" -> "avg")) * df.groupBy().agg(Map("age" -> "max", "salary" -> "avg")) * }} + * @group dfops */ def agg(exprs: Map[String, String]): DataFrame = groupBy().agg(exprs) @@ -388,6 +441,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * df.agg(Map("age" -> "max", "salary" -> "avg")) * df.groupBy().agg(Map("age" -> "max", "salary" -> "avg")) * }} + * @group dfops */ def agg(exprs: java.util.Map[String, String]): DataFrame = groupBy().agg(exprs) @@ -398,6 +452,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * df.agg(max($"age"), avg($"salary")) * df.groupBy().agg(max($"age"), avg($"salary")) * }} + * @group dfops */ @scala.annotation.varargs def agg(expr: Column, exprs: Column*): DataFrame = groupBy().agg(expr, exprs :_*) @@ -405,24 +460,28 @@ trait DataFrame extends RDDApi[Row] with Serializable { /** * Returns a new [[DataFrame]] by taking the first `n` rows. The difference between this function * and `head` is that `head` returns an array while `limit` returns a new [[DataFrame]]. + * @group dfops */ def limit(n: Int): DataFrame /** * Returns a new [[DataFrame]] containing union of rows in this frame and another frame. * This is equivalent to `UNION ALL` in SQL. + * @group dfops */ def unionAll(other: DataFrame): DataFrame /** * Returns a new [[DataFrame]] containing rows only in both this frame and another frame. * This is equivalent to `INTERSECT` in SQL. 
+ * @group dfops */ def intersect(other: DataFrame): DataFrame /** * Returns a new [[DataFrame]] containing rows in this frame but not in another frame. * This is equivalent to `EXCEPT` in SQL. + * @group dfops */ def except(other: DataFrame): DataFrame @@ -432,6 +491,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @param withReplacement Sample with replacement or not. * @param fraction Fraction of rows to generate. * @param seed Seed for sampling. + * @group dfops */ def sample(withReplacement: Boolean, fraction: Double, seed: Long): DataFrame @@ -440,6 +500,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * * @param withReplacement Sample with replacement or not. * @param fraction Fraction of rows to generate. + * @group dfops */ def sample(withReplacement: Boolean, fraction: Double): DataFrame = { sample(withReplacement, fraction, Utils.random.nextLong) @@ -464,6 +525,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * * val bookCountPerWord = allWords.groupBy("word").agg(countDistinct("title")) * }}} + * @group dfops */ def explode[A <: Product : TypeTag](input: Column*)(f: Row => TraversableOnce[A]): DataFrame @@ -476,6 +538,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * {{{ * df.explode("words", "word")(words: String => words.split(" ")) * }}} + * @group dfops */ def explode[A, B : TypeTag]( inputColumn: String, @@ -486,11 +549,13 @@ trait DataFrame extends RDDApi[Row] with Serializable { /** * Returns a new [[DataFrame]] by adding a column. + * @group dfops */ def withColumn(colName: String, col: Column): DataFrame /** * Returns a new [[DataFrame]] with a column renamed. + * @group dfops */ def withColumnRenamed(existingName: String, newName: String): DataFrame @@ -511,62 +576,84 @@ trait DataFrame extends RDDApi[Row] with Serializable { /** * Returns a new RDD by applying a function to all rows of this DataFrame. + * @group rdd */ override def map[R: ClassTag](f: Row => R): RDD[R] /** * Returns a new RDD by first applying a function to all rows of this [[DataFrame]], * and then flattening the results. + * @group rdd */ override def flatMap[R: ClassTag](f: Row => TraversableOnce[R]): RDD[R] /** * Returns a new RDD by applying a function to each partition of this DataFrame. + * @group rdd */ override def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] /** * Applies a function `f` to all rows. + * @group rdd */ override def foreach(f: Row => Unit): Unit /** * Applies a function f to each partition of this [[DataFrame]]. + * @group rdd */ override def foreachPartition(f: Iterator[Row] => Unit): Unit /** * Returns the first `n` rows in the [[DataFrame]]. + * @group action */ override def take(n: Int): Array[Row] /** * Returns an array that contains all of [[Row]]s in this [[DataFrame]]. + * @group action */ override def collect(): Array[Row] /** * Returns a Java list that contains all of [[Row]]s in this [[DataFrame]]. + * @group action */ override def collectAsList(): java.util.List[Row] /** * Returns the number of rows in the [[DataFrame]]. + * @group action */ override def count(): Long /** * Returns a new [[DataFrame]] that has exactly `numPartitions` partitions. + * @group rdd */ override def repartition(numPartitions: Int): DataFrame - /** Returns a new [[DataFrame]] that contains only the unique rows from this [[DataFrame]]. */ + /** + * Returns a new [[DataFrame]] that contains only the unique rows from this [[DataFrame]]. 
+ * @group dfops + */ override def distinct: DataFrame + /** + * @group basic + */ override def persist(): this.type + /** + * @group basic + */ override def persist(newLevel: StorageLevel): this.type + /** + * @group basic + */ override def unpersist(blocking: Boolean): this.type ///////////////////////////////////////////////////////////////////////////// @@ -575,16 +662,19 @@ trait DataFrame extends RDDApi[Row] with Serializable { /** * Returns the content of the [[DataFrame]] as an [[RDD]] of [[Row]]s. + * @group rdd */ def rdd: RDD[Row] /** * Returns the content of the [[DataFrame]] as a [[JavaRDD]] of [[Row]]s. + * @group rdd */ def toJavaRDD: JavaRDD[Row] = rdd.toJavaRDD() /** * Returns the content of the [[DataFrame]] as a [[JavaRDD]] of [[Row]]s. + * @group rdd */ def javaRDD: JavaRDD[Row] = toJavaRDD @@ -592,7 +682,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * Registers this RDD as a temporary table using the given name. The lifetime of this temporary * table is tied to the [[SQLContext]] that was used to create this DataFrame. * - * @group schema + * @group basic */ def registerTempTable(tableName: String): Unit @@ -600,6 +690,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * Saves the contents of this [[DataFrame]] as a parquet file, preserving the schema. * Files that are written out using this method can be read back in as a [[DataFrame]] * using the `parquetFile` function in [[SQLContext]]. + * @group output */ def saveAsParquetFile(path: String): Unit @@ -613,6 +704,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * there is no notion of a persisted catalog in a standard SQL context. Instead you can write * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. + * @group output */ @Experimental def saveAsTable(tableName: String): Unit = { @@ -628,6 +720,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * there is no notion of a persisted catalog in a standard SQL context. Instead you can write * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. + * @group output */ @Experimental def saveAsTable(tableName: String, mode: SaveMode): Unit = { @@ -651,6 +744,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * there is no notion of a persisted catalog in a standard SQL context. Instead you can write * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. + * @group output */ @Experimental def saveAsTable( @@ -668,6 +762,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * there is no notion of a persisted catalog in a standard SQL context. Instead you can write * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. + * @group output */ @Experimental def saveAsTable( @@ -686,6 +781,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * there is no notion of a persisted catalog in a standard SQL context. Instead you can write * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. + * @group output */ @Experimental def saveAsTable( @@ -706,6 +802,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * there is no notion of a persisted catalog in a standard SQL context. 
Instead you can write * an RDD out to a parquet file, and then register that file as a table. This "table" can then * be the target of an `insertInto`. + * @group output */ @Experimental def saveAsTable( @@ -719,6 +816,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * Saves the contents of this DataFrame to the given path, * using the default data source configured by spark.sql.sources.default and * [[SaveMode.ErrorIfExists]] as the save mode. + * @group output */ @Experimental def save(path: String): Unit = { @@ -729,6 +827,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * :: Experimental :: * Saves the contents of this DataFrame to the given path and [[SaveMode]] specified by mode, * using the default data source configured by spark.sql.sources.default. + * @group output */ @Experimental def save(path: String, mode: SaveMode): Unit = { @@ -740,6 +839,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * :: Experimental :: * Saves the contents of this DataFrame to the given path based on the given data source, * using [[SaveMode.ErrorIfExists]] as the save mode. + * @group output */ @Experimental def save(path: String, source: String): Unit = { @@ -750,6 +850,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * :: Experimental :: * Saves the contents of this DataFrame to the given path based on the given data source and * [[SaveMode]] specified by mode. + * @group output */ @Experimental def save(path: String, source: String, mode: SaveMode): Unit = { @@ -760,6 +861,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * :: Experimental :: * Saves the contents of this DataFrame based on the given data source, * [[SaveMode]] specified by mode, and a set of options. + * @group output */ @Experimental def save( @@ -774,6 +876,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * (Scala-specific) * Saves the contents of this DataFrame based on the given data source, * [[SaveMode]] specified by mode, and a set of options + * @group output */ @Experimental def save( @@ -784,6 +887,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { /** * :: Experimental :: * Adds the rows from this RDD to the specified table, optionally overwriting the existing data. + * @group output */ @Experimental def insertInto(tableName: String, overwrite: Boolean): Unit @@ -792,15 +896,46 @@ trait DataFrame extends RDDApi[Row] with Serializable { * :: Experimental :: * Adds the rows from this RDD to the specified table. * Throws an exception if the table already exists. + * @group output */ @Experimental def insertInto(tableName: String): Unit = insertInto(tableName, overwrite = false) /** * Returns the content of the [[DataFrame]] as a RDD of JSON strings. + * @group rdd */ def toJSON: RDD[String] + //////////////////////////////////////////////////////////////////////////// + // JDBC Write Support + //////////////////////////////////////////////////////////////////////////// + + /** + * Save this RDD to a JDBC database at `url` under the table name `table`. + * This will run a `CREATE TABLE` and a bunch of `INSERT INTO` statements. + * If you pass `true` for `allowExisting`, it will drop any table with the + * given name; if you pass `false`, it will throw if the table already + * exists. + * @group output + */ + def createJDBCTable(url: String, table: String, allowExisting: Boolean): Unit + + /** + * Save this RDD to a JDBC database at `url` under the table name `table`. + * Assumes the table already exists and has a compatible schema. 
If you + * pass `true` for `overwrite`, it will `TRUNCATE` the table before + * performing the `INSERT`s. + * + * The table must already exist on the database. It must have a schema + * that is compatible with the schema of this RDD; inserting the rows of + * the RDD in order via the simple statement + * `INSERT INTO table VALUES (?, ?, ..., ?)` should not fail. + * @group output + */ + def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit + + //////////////////////////////////////////////////////////////////////////// // for Python API //////////////////////////////////////////////////////////////////////////// diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index 848ea2e05624f..25bc9d929237d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql import java.io.CharArrayWriter +import java.sql.DriverManager import scala.language.implicitConversions import scala.reflect.ClassTag @@ -36,6 +37,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.{JoinType, Inner} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.{ExplainCommand, LogicalRDD, EvaluatePython} +import org.apache.spark.sql.jdbc.JDBCWriteDetails import org.apache.spark.sql.json.JsonRDD import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{NumericType, StructType} @@ -375,7 +377,7 @@ private[sql] class DataFrameImpl protected[sql]( } override def registerTempTable(tableName: String): Unit = { - sqlContext.registerRDDAsTable(this, tableName) + sqlContext.registerDataFrameAsTable(this, tableName) } override def saveAsParquetFile(path: String): Unit = { @@ -441,6 +443,35 @@ private[sql] class DataFrameImpl protected[sql]( } } + def createJDBCTable(url: String, table: String, allowExisting: Boolean): Unit = { + val conn = DriverManager.getConnection(url) + try { + if (allowExisting) { + val sql = s"DROP TABLE IF EXISTS $table" + conn.prepareStatement(sql).executeUpdate() + } + val schema = JDBCWriteDetails.schemaString(this, url) + val sql = s"CREATE TABLE $table ($schema)" + conn.prepareStatement(sql).executeUpdate() + } finally { + conn.close() + } + JDBCWriteDetails.saveTable(this, url, table) + } + + def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit = { + if (overwrite) { + val conn = DriverManager.getConnection(url) + try { + val sql = s"TRUNCATE TABLE $table" + conn.prepareStatement(sql).executeUpdate() + } finally { + conn.close() + } + } + JDBCWriteDetails.saveTable(this, url, table) + } + //////////////////////////////////////////////////////////////////////////// // for Python API //////////////////////////////////////////////////////////////////////////// diff --git a/sql/core/src/main/scala/org/apache/spark/sql/ExperimentalMethods.scala b/sql/core/src/main/scala/org/apache/spark/sql/ExperimentalMethods.scala index f0e6a8f332188..d5d7e35a6b35d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/ExperimentalMethods.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/ExperimentalMethods.scala @@ -20,8 +20,13 @@ package org.apache.spark.sql import org.apache.spark.annotation.Experimental /** + * :: Experimental :: * Holder for experimental methods for the bravest. 
We make NO guarantee about the stability * regarding binary compatibility and source compatibility of methods here. + * + * {{{ + * sqlContext.experimental.extraStrategies += ... + * }}} */ @Experimental class ExperimentalMethods protected[sql](sqlContext: SQLContext) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala index fc37cfa7a899f..b48b682b36e1f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala @@ -173,6 +173,10 @@ private[sql] class IncomputableColumn(protected[sql] val expr: Expression) exten override def insertInto(tableName: String, overwrite: Boolean): Unit = err() + def createJDBCTable(url: String, table: String, allowExisting: Boolean): Unit = err() + + def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit = err() + override def toJSON: RDD[String] = err() protected[sql] override def javaToPython: JavaRDD[Array[Byte]] = err() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 0aae0942ca04f..31afa0eb59a8e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -43,11 +43,16 @@ import org.apache.spark.util.Utils import org.apache.spark.{Partition, SparkContext} /** - * The entry point for running relational queries using Spark. Allows the creation of [[DataFrame]] - * objects and the execution of SQL queries. + * The entry point for working with structured data (rows and columns) in Spark. Allows the + * creation of [[DataFrame]] objects as well as the execution of SQL queries. * - * @groupname ddl_ops Catalog DDL functions - * @groupname userf Spark SQL Functions + * @groupname basic Basic Operations + * @groupname ddl_ops Persistent Catalog DDL + * @groupname cachemgmt Cached Table Management + * @groupname genericdata Generic Data Sources + * @groupname specificdata Specific Data Sources + * @groupname config Configuration + * @groupname dataframes Custom DataFrame Creation * @groupname Ungrouped Support functions for language integrated queries. */ class SQLContext(@transient val sparkContext: SparkContext) @@ -61,24 +66,40 @@ class SQLContext(@transient val sparkContext: SparkContext) // Note that this is a lazy val so we can override the default value in subclasses. protected[sql] lazy val conf: SQLConf = new SQLConf - /** Set Spark SQL configuration properties. */ + /** + * Set Spark SQL configuration properties. + * + * @group config + */ def setConf(props: Properties): Unit = conf.setConf(props) - /** Set the given Spark SQL configuration property. */ + /** + * Set the given Spark SQL configuration property. + * + * @group config + */ def setConf(key: String, value: String): Unit = conf.setConf(key, value) - /** Return the value of Spark SQL configuration property for the given key. */ + /** + * Return the value of Spark SQL configuration property for the given key. + * + * @group config + */ def getConf(key: String): String = conf.getConf(key) /** * Return the value of Spark SQL configuration property for the given key. If the key is not set * yet, return `defaultValue`. + * + * @group config */ def getConf(key: String, defaultValue: String): String = conf.getConf(key, defaultValue) /** * Return all the configuration properties that have been set (i.e. 
not the default). * This creates a new copy of the config properties in the form of a Map. + * + * @group config */ def getAllConfs: immutable.Map[String, String] = conf.getAllConfs @@ -128,7 +149,9 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: * A collection of methods that are considered experimental, but can be used to hook into - * the query planner for advanced functionalities. + * the query planner for advanced functionality. + * + * @group basic */ @Experimental @transient @@ -137,6 +160,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: * Returns a [[DataFrame]] with no rows or columns. + * + * @group basic */ @Experimental @transient @@ -167,17 +192,28 @@ class SQLContext(@transient val sparkContext: SparkContext) * (Integer arg1, String arg2) -> arg2 + arg1), * DataTypes.StringType); * }}} + * + * @group basic */ @transient val udf: UDFRegistration = new UDFRegistration(this) - /** Returns true if the table is currently cached in-memory. */ + /** + * Returns true if the table is currently cached in-memory. + * @group cachemgmt + */ def isCached(tableName: String): Boolean = cacheManager.isCached(tableName) - /** Caches the specified table in-memory. */ + /** + * Caches the specified table in-memory. + * @group cachemgmt + */ def cacheTable(tableName: String): Unit = cacheManager.cacheTable(tableName) - /** Removes the specified table from the in-memory cache. */ + /** + * Removes the specified table from the in-memory cache. + * @group cachemgmt + */ def uncacheTable(tableName: String): Unit = cacheManager.uncacheTable(tableName) // scalastyle:off @@ -186,6 +222,13 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * (Scala-specific) Implicit methods available in Scala for converting * common Scala objects into [[DataFrame]]s. + * + * {{{ + * val sqlContext = new SQLContext + * import sqlContext._ + * }}} + * + * @group basic */ @Experimental object implicits extends Serializable { @@ -260,7 +303,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * Creates a DataFrame from an RDD of case classes. * - * @group userf + * @group dataframes */ @Experimental def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = { @@ -274,6 +317,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: * Creates a DataFrame from a local Seq of Product. + * + * @group dataframes */ @Experimental def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = { @@ -285,6 +330,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * Convert a [[BaseRelation]] created for external data sources into a [[DataFrame]]. + * + * @group dataframes */ def baseRelationToDataFrame(baseRelation: BaseRelation): DataFrame = { DataFrame(this, LogicalRelation(baseRelation)) @@ -318,6 +365,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * dataFrame.registerTempTable("people") * sqlContext.sql("select name from people").collect.foreach(println) * }}} + * + * @group dataframes */ @DeveloperApi def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = { @@ -332,6 +381,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * Creates a [[DataFrame]] from an [[JavaRDD]] containing [[Row]]s using the given schema. * It is important to make sure that the structure of every [[Row]] of the provided RDD matches * the provided schema. Otherwise, there will be runtime exception. 
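Editor's note: the createDataFrame overloads documented above also accept Scala products directly; a minimal, hypothetical sketch (the case class and values are invented, not part of the patch):

    case class Person(name: String, age: Int)
    // From an RDD of case classes, or from a local Seq of case classes.
    val df1 = sqlContext.createDataFrame(sc.parallelize(Seq(Person("alice", 29), Person("bob", 31))))
    val df2 = sqlContext.createDataFrame(Seq(Person("carol", 35)))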
+ * + * @group dataframes */ @DeveloperApi def createDataFrame(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = { @@ -346,6 +397,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * @param rowRDD an JavaRDD of Row * @param columns names for each column * @return DataFrame + * @group dataframes */ def createDataFrame(rowRDD: JavaRDD[Row], columns: java.util.List[String]): DataFrame = { createDataFrame(rowRDD.rdd, columns.toSeq) @@ -356,6 +408,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, * SELECT * queries will return the columns in an undefined order. + * @group dataframes */ def createDataFrame(rdd: RDD[_], beanClass: Class[_]): DataFrame = { val attributeSeq = getSchema(beanClass) @@ -383,6 +436,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, * SELECT * queries will return the columns in an undefined order. + * @group dataframes */ def createDataFrame(rdd: JavaRDD[_], beanClass: Class[_]): DataFrame = { createDataFrame(rdd.rdd, beanClass) @@ -416,8 +470,6 @@ class SQLContext(@transient val sparkContext: SparkContext) * dataFrame.registerTempTable("people") * sqlContext.sql("select name from people").collect.foreach(println) * }}} - * - * @group userf */ @deprecated("use createDataFrame", "1.3.0") def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = { @@ -455,7 +507,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty * [[DataFrame]] if no paths are passed in. * - * @group userf + * @group specificdata */ @scala.annotation.varargs def parquetFile(paths: String*): DataFrame = { @@ -473,7 +525,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * Loads a JSON file (one object per line), returning the result as a [[DataFrame]]. * It goes through the entire dataset once to determine the schema. * - * @group userf + * @group specificdata */ def jsonFile(path: String): DataFrame = jsonFile(path, 1.0) @@ -482,7 +534,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * Loads a JSON file (one object per line) and applies the given schema, * returning the result as a [[DataFrame]]. * - * @group userf + * @group specificdata */ @Experimental def jsonFile(path: String, schema: StructType): DataFrame = { @@ -492,6 +544,7 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: + * @group specificdata */ @Experimental def jsonFile(path: String, samplingRatio: Double): DataFrame = { @@ -504,10 +557,18 @@ class SQLContext(@transient val sparkContext: SparkContext) * [[DataFrame]]. * It goes through the entire dataset once to determine the schema. * - * @group userf + * @group specificdata */ def jsonRDD(json: RDD[String]): DataFrame = jsonRDD(json, 1.0) + + /** + * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a + * [[DataFrame]]. + * It goes through the entire dataset once to determine the schema. + * + * @group specificdata + */ def jsonRDD(json: JavaRDD[String]): DataFrame = jsonRDD(json.rdd, 1.0) /** @@ -515,7 +576,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema, * returning the result as a [[DataFrame]]. 
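Editor's note: a short, hypothetical sketch of the JSON loaders regrouped under "specificdata" above (the path and record are invented for illustration):

    // One JSON object per line in a file, or one object per record in an RDD[String].
    val fromFile = sqlContext.jsonFile("examples/src/main/resources/people.json")
    val fromRdd  = sqlContext.jsonRDD(sc.parallelize(Seq("""{"name":"alice","age":29}""")))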
* - * @group userf + * @group specificdata */ @Experimental def jsonRDD(json: RDD[String], schema: StructType): DataFrame = { @@ -528,6 +589,13 @@ class SQLContext(@transient val sparkContext: SparkContext) createDataFrame(rowRDD, appliedSchema) } + /** + * :: Experimental :: + * Loads an JavaRDD storing JSON objects (one object per record) and applies the given + * schema, returning the result as a [[DataFrame]]. + * + * @group specificdata + */ @Experimental def jsonRDD(json: JavaRDD[String], schema: StructType): DataFrame = { jsonRDD(json.rdd, schema) @@ -535,6 +603,10 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: + * Loads an RDD[String] storing JSON objects (one object per record) inferring the + * schema, returning the result as a [[DataFrame]]. + * + * @group specificdata */ @Experimental def jsonRDD(json: RDD[String], samplingRatio: Double): DataFrame = { @@ -546,6 +618,13 @@ class SQLContext(@transient val sparkContext: SparkContext) createDataFrame(rowRDD, appliedSchema) } + /** + * :: Experimental :: + * Loads a JavaRDD[String] storing JSON objects (one object per record) inferring the + * schema, returning the result as a [[DataFrame]]. + * + * @group specificdata + */ @Experimental def jsonRDD(json: JavaRDD[String], samplingRatio: Double): DataFrame = { jsonRDD(json.rdd, samplingRatio); @@ -555,6 +634,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * Returns the dataset stored at path as a DataFrame, * using the default data source configured by spark.sql.sources.default. + * + * @group genericdata */ @Experimental def load(path: String): DataFrame = { @@ -565,6 +646,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: * Returns the dataset stored at path as a DataFrame, using the given data source. + * + * @group genericdata */ @Experimental def load(path: String, source: String): DataFrame = { @@ -575,6 +658,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * (Java-specific) Returns the dataset specified by the given data source and * a set of options as a DataFrame. + * + * @group genericdata */ @Experimental def load(source: String, options: java.util.Map[String, String]): DataFrame = { @@ -585,6 +670,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * (Scala-specific) Returns the dataset specified by the given data source and * a set of options as a DataFrame. + * + * @group genericdata */ @Experimental def load(source: String, options: Map[String, String]): DataFrame = { @@ -596,6 +683,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * (Java-specific) Returns the dataset specified by the given data source and * a set of options as a DataFrame, using the given schema as the schema of the DataFrame. + * + * @group genericdata */ @Experimental def load( @@ -609,6 +698,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * :: Experimental :: * (Scala-specific) Returns the dataset specified by the given data source and * a set of options as a DataFrame, using the given schema as the schema of the DataFrame. + * @group genericdata */ @Experimental def load( @@ -733,54 +823,70 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * :: Experimental :: - * Construct an RDD representing the database table accessible via JDBC URL + * Construct a [[DataFrame]] representing the database table accessible via JDBC URL * url named table. 
+ * + * @group specificdata */ @Experimental - def jdbcRDD(url: String, table: String): DataFrame = { - jdbcRDD(url, table, null.asInstanceOf[JDBCPartitioningInfo]) + def jdbc(url: String, table: String): DataFrame = { + jdbc(url, table, JDBCRelation.columnPartition(null)) } /** * :: Experimental :: - * Construct an RDD representing the database table accessible via JDBC URL - * url named table. The PartitioningInfo parameter - * gives the name of a column of integral type, a number of partitions, and - * advisory minimum and maximum values for the column. The RDD is - * partitioned according to said column. + * Construct a [[DataFrame]] representing the database table accessible via JDBC URL + * url named table. Partitions of the table will be retrieved in parallel based on the parameters + * passed to this function. + * + * @param columnName the name of a column of integral type that will be used for partitioning. + * @param lowerBound the minimum value of `columnName` to retrieve + * @param upperBound the maximum value of `columnName` to retrieve + * @param numPartitions the number of partitions. the range `minValue`-`maxValue` will be split + * evenly into this many partitions + * + * @group specificdata */ @Experimental - def jdbcRDD(url: String, table: String, partitioning: JDBCPartitioningInfo): - DataFrame = { + def jdbc( + url: String, + table: String, + columnName: String, + lowerBound: Long, + upperBound: Long, + numPartitions: Int): DataFrame = { + val partitioning = JDBCPartitioningInfo(columnName, lowerBound, upperBound, numPartitions) val parts = JDBCRelation.columnPartition(partitioning) - jdbcRDD(url, table, parts) + jdbc(url, table, parts) } /** * :: Experimental :: - * Construct an RDD representing the database table accessible via JDBC URL + * Construct a [[DataFrame]] representing the database table accessible via JDBC URL * url named table. The theParts parameter gives a list expressions * suitable for inclusion in WHERE clauses; each one defines one partition - * of the RDD. + * of the [[DataFrame]]. + * + * @group specificdata */ @Experimental - def jdbcRDD(url: String, table: String, theParts: Array[String]): DataFrame = { + def jdbc(url: String, table: String, theParts: Array[String]): DataFrame = { val parts: Array[Partition] = theParts.zipWithIndex.map { case (part, i) => JDBCPartition(part, i) : Partition } - jdbcRDD(url, table, parts) + jdbc(url, table, parts) } - private def jdbcRDD(url: String, table: String, parts: Array[Partition]): DataFrame = { + private def jdbc(url: String, table: String, parts: Array[Partition]): DataFrame = { val relation = JDBCRelation(url, table, parts)(this) baseRelationToDataFrame(relation) } /** - * Registers the given RDD as a temporary table in the catalog. Temporary tables exist only - * during the lifetime of this instance of SQLContext. + * Registers the given [[DataFrame]] as a temporary table in the catalog. Temporary tables exist + * only during the lifetime of this instance of SQLContext. */ - private[sql] def registerRDDAsTable(rdd: DataFrame, tableName: String): Unit = { + private[sql] def registerDataFrameAsTable(rdd: DataFrame, tableName: String): Unit = { catalog.registerTable(Seq(tableName), rdd.logicalPlan) } @@ -790,7 +896,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @param tableName the name of the table to be unregistered. 
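Editor's note: a hypothetical usage sketch of the jdbc() readers that replace jdbcRDD() in the hunks above; the in-memory H2 URL mirrors the ones used in the JDBC test suites further below and is not part of this hunk.

    val whole        = sqlContext.jdbc("jdbc:h2:mem:testdb", "TEST.PEOPLE")
    val byColumn     = sqlContext.jdbc("jdbc:h2:mem:testdb", "TEST.PEOPLE", "THEID", 0L, 4L, 3)
    val byPredicates = sqlContext.jdbc("jdbc:h2:mem:testdb", "TEST.PEOPLE", Array("THEID < 2", "THEID >= 2"))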
* - * @group ddl_ops + * @group basic */ def dropTempTable(tableName: String): Unit = { cacheManager.tryUncacheQuery(table(tableName)) @@ -801,7 +907,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * Executes a SQL query using Spark, returning the result as a [[DataFrame]]. The dialect that is * used for SQL parsing can be configured with 'spark.sql.dialect'. * - * @group userf + * @group basic */ def sql(sqlText: String): DataFrame = { if (conf.dialect == "sql") { @@ -811,7 +917,11 @@ class SQLContext(@transient val sparkContext: SparkContext) } } - /** Returns the specified table as a [[DataFrame]]. */ + /** + * Returns the specified table as a [[DataFrame]]. + * + * @group ddl_ops + */ def table(tableName: String): DataFrame = DataFrame(this, catalog.lookupRelation(Seq(tableName))) @@ -819,6 +929,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * Returns a [[DataFrame]] containing names of existing tables in the current database. * The returned DataFrame has two columns, tableName and isTemporary (a Boolean * indicating if a table is a temporary one or not). + * + * @group ddl_ops */ def tables(): DataFrame = { DataFrame(this, ShowTablesCommand(None)) @@ -828,6 +940,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * Returns a [[DataFrame]] containing names of existing tables in the given database. * The returned DataFrame has two columns, tableName and isTemporary (a Boolean * indicating if a table is a temporary one or not). + * + * @group ddl_ops */ def tables(databaseName: String): DataFrame = { DataFrame(this, ShowTablesCommand(Some(databaseName))) @@ -835,6 +949,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * Returns the names of tables in the current database as an array. + * + * @group ddl_ops */ def tableNames(): Array[String] = { catalog.getTables(None).map { @@ -844,6 +960,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * Returns the names of tables in the given database as an array. + * + * @group ddl_ops */ def tableNames(databaseName: String): Array[String] = { catalog.getTables(Some(databaseName)).map { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala index ee94a5fdbe376..295db539adfc4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UserDefinedFunction.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.types.DataType * df.select( predict(df("score")) ) * }}} */ -case class UserDefinedFunction(f: AnyRef, dataType: DataType) { +case class UserDefinedFunction protected[sql] (f: AnyRef, dataType: DataType) { def apply(exprs: Column*): Column = { Column(ScalaUdf(f, dataType, exprs.map(_.expr))) @@ -58,6 +58,7 @@ private[sql] case class UserDefinedPythonFunction( accumulator: Accumulator[JList[Array[Byte]]], dataType: DataType) { + /** Returns a [[Column]] that will evaluate to calling this UDF with the given input. 
*/ def apply(exprs: Column*): Column = { val udf = PythonUDF(name, command, envVars, pythonIncludes, pythonExec, broadcastVars, accumulator, dataType, exprs.map(_.expr)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/package.scala new file mode 100644 index 0000000000000..cbbd005228d44 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/package.scala @@ -0,0 +1,23 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql + +/** + * Contains API classes that are specific to a single language (i.e. Java). + */ +package object api diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index c6cd6eb6a22b6..7c92e9fc88168 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -144,7 +144,7 @@ case class CacheTableCommand( override def run(sqlContext: SQLContext) = { plan.foreach { logicalPlan => - sqlContext.registerRDDAsTable(DataFrame(sqlContext, logicalPlan), tableName) + sqlContext.registerDataFrameAsTable(DataFrame(sqlContext, logicalPlan), tableName) } sqlContext.cacheTable(tableName) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index acef49aabfe70..73162b22fa9cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -32,7 +32,9 @@ import org.apache.spark.sql.types._ * * Usage: * {{{ - * sql("SELECT key FROM src").debug + * import org.apache.spark.sql.execution.debug._ + * sql("SELECT key FROM src").debug() + * dataFrame.typeCheck() * }}} */ package object debug { @@ -144,11 +146,9 @@ package object debug { } /** - * :: DeveloperApi :: * Helper functions for checking that runtime types match a given schema. */ - @DeveloperApi - object TypeCheck { + private[sql] object TypeCheck { def typeCheck(data: Any, schema: DataType): Unit = (data, schema) match { case (null, _) => @@ -174,10 +174,8 @@ package object debug { } /** - * :: DeveloperApi :: * Augments [[DataFrame]]s with debug methods. 
*/ - @DeveloperApi private[sql] case class TypeCheck(child: SparkPlan) extends SparkPlan { import TypeCheck._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala index 34a83f0a5dad8..34f864f5fda7a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/jdbc.scala @@ -26,11 +26,11 @@ import org.apache.spark.sql.jdbc.{JDBCPartitioningInfo, JDBCRelation, JDBCPartit import org.apache.spark.sql.types._ package object jdbc { - object JDBCWriteDetails extends Logging { + private[sql] object JDBCWriteDetails extends Logging { /** * Returns a PreparedStatement that inserts a row into table via conn. */ - private def insertStatement(conn: Connection, table: String, rddSchema: StructType): + def insertStatement(conn: Connection, table: String, rddSchema: StructType): PreparedStatement = { val sql = new StringBuilder(s"INSERT INTO $table VALUES (") var fieldsLeft = rddSchema.fields.length @@ -56,7 +56,7 @@ package object jdbc { * non-Serializable. Instead, we explicitly close over all variables that * are used. */ - private[jdbc] def savePartition(url: String, table: String, iterator: Iterator[Row], + def savePartition(url: String, table: String, iterator: Iterator[Row], rddSchema: StructType, nullTypes: Array[Int]): Iterator[Byte] = { val conn = DriverManager.getConnection(url) var committed = false @@ -117,19 +117,14 @@ package object jdbc { } Array[Byte]().iterator } - } - /** - * Make it so that you can call createJDBCTable and insertIntoJDBC on a DataFrame. - */ - implicit class JDBCDataFrame(rdd: DataFrame) { /** * Compute the schema string for this RDD. */ - private def schemaString(url: String): String = { + def schemaString(df: DataFrame, url: String): String = { val sb = new StringBuilder() val quirks = DriverQuirks.get(url) - rdd.schema.fields foreach { field => { + df.schema.fields foreach { field => { val name = field.name var typ: String = quirks.getJDBCType(field.dataType)._1 if (typ == null) typ = field.dataType match { @@ -156,9 +151,9 @@ package object jdbc { /** * Saves the RDD to the database in a single transaction. */ - private def saveTable(url: String, table: String) { + def saveTable(df: DataFrame, url: String, table: String) { val quirks = DriverQuirks.get(url) - var nullTypes: Array[Int] = rdd.schema.fields.map(field => { + var nullTypes: Array[Int] = df.schema.fields.map(field => { var nullType: Option[Int] = quirks.getJDBCType(field.dataType)._2 if (nullType.isEmpty) { field.dataType match { @@ -175,61 +170,16 @@ package object jdbc { case DateType => java.sql.Types.DATE case DecimalType.Unlimited => java.sql.Types.DECIMAL case _ => throw new IllegalArgumentException( - s"Can't translate null value for field $field") + s"Can't translate null value for field $field") } } else nullType.get }).toArray - val rddSchema = rdd.schema - rdd.mapPartitions(iterator => JDBCWriteDetails.savePartition( - url, table, iterator, rddSchema, nullTypes)).collect() - } - - /** - * Save this RDD to a JDBC database at `url` under the table name `table`. - * This will run a `CREATE TABLE` and a bunch of `INSERT INTO` statements. - * If you pass `true` for `allowExisting`, it will drop any table with the - * given name; if you pass `false`, it will throw if the table already - * exists. 
- */ - def createJDBCTable(url: String, table: String, allowExisting: Boolean) { - val conn = DriverManager.getConnection(url) - try { - if (allowExisting) { - val sql = s"DROP TABLE IF EXISTS $table" - conn.prepareStatement(sql).executeUpdate() - } - val schema = schemaString(url) - val sql = s"CREATE TABLE $table ($schema)" - conn.prepareStatement(sql).executeUpdate() - } finally { - conn.close() + val rddSchema = df.schema + df.foreachPartition { iterator => + JDBCWriteDetails.savePartition(url, table, iterator, rddSchema, nullTypes) } - saveTable(url, table) } - /** - * Save this RDD to a JDBC database at `url` under the table name `table`. - * Assumes the table already exists and has a compatible schema. If you - * pass `true` for `overwrite`, it will `TRUNCATE` the table before - * performing the `INSERT`s. - * - * The table must already exist on the database. It must have a schema - * that is compatible with the schema of this RDD; inserting the rows of - * the RDD in order via the simple statement - * `INSERT INTO table VALUES (?, ?, ..., ?)` should not fail. - */ - def insertIntoJDBC(url: String, table: String, overwrite: Boolean) { - if (overwrite) { - val conn = DriverManager.getConnection(url) - try { - val sql = s"TRUNCATE TABLE $table" - conn.prepareStatement(sql).executeUpdate() - } finally { - conn.close() - } - } - saveTable(url, table) - } - } // implicit class JDBCDataFrame + } } // package object jdbc diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 7dd8bea49b8a5..65966458eb670 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -55,7 +55,7 @@ import org.apache.spark.{Logging, SerializableWritable, TaskContext} * Parquet table scan operator. Imports the file that backs the given * [[org.apache.spark.sql.parquet.ParquetRelation]] as a ``RDD[Row]``. */ -case class ParquetTableScan( +private[sql] case class ParquetTableScan( attributes: Seq[Attribute], relation: ParquetRelation, columnPruningPred: Seq[Expression]) @@ -210,7 +210,7 @@ case class ParquetTableScan( * (only detected via filename pattern so will not catch all cases). */ @DeveloperApi -case class InsertIntoParquetTable( +private[sql] case class InsertIntoParquetTable( relation: ParquetRelation, child: SparkPlan, overwrite: Boolean = false) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala index d0856df8d4f43..052728c5d5ceb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -34,7 +34,7 @@ import org.apache.spark.util.Utils * convenient to use tuples rather than special case classes when writing test cases/suites. * Especially, `Tuple1.apply` can be used to easily wrap a single type/value. 
*/ -trait ParquetTest { +private[sql] trait ParquetTest { val sqlContext: SQLContext import sqlContext.implicits.{localSeqToDataFrameHolder, rddToDataFrameHolder} @@ -121,7 +121,7 @@ trait ParquetTest { (data: Seq[T], tableName: String) (f: => Unit): Unit = { withParquetRDD(data) { rdd => - sqlContext.registerRDDAsTable(rdd, tableName) + sqlContext.registerDataFrameAsTable(rdd, tableName) withTempTable(tableName)(f) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 9bb34e2df9a26..95bea9201163d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -72,7 +72,7 @@ import org.apache.spark.{Logging, Partition => SparkPartition, SerializableWrita * null or empty string. This is similar to the `hive.exec.default.partition.name` configuration * in Hive. */ -class DefaultSource +private[sql] class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { @@ -147,7 +147,7 @@ private[sql] case class PartitionSpec(partitionColumns: StructType, partitions: * discovery. */ @DeveloperApi -case class ParquetRelation2( +private[sql] case class ParquetRelation2( paths: Seq[String], parameters: Map[String, String], maybeSchema: Option[StructType] = None, @@ -600,7 +600,7 @@ case class ParquetRelation2( } } -object ParquetRelation2 { +private[sql] object ParquetRelation2 { // Whether we should merge schemas collected from all Parquet part-files. val MERGE_SCHEMA = "mergeSchema" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/timestamp/NanoTime.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/timestamp/NanoTime.scala index 887161684429f..e24475292ceaf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/timestamp/NanoTime.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/timestamp/NanoTime.scala @@ -53,7 +53,7 @@ private[parquet] class NanoTime extends Serializable { "NanoTime{julianDay=" + julianDay + ", timeOfDayNanos=" + timeOfDayNanos + "}" } -object NanoTime { +private[sql] object NanoTime { def fromBinary(bytes: Binary): NanoTime = { Preconditions.checkArgument(bytes.length() == 12, "Must be 12 bytes") val buf = bytes.toByteBuffer diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index dd8b3d211be64..5020689f7a105 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -374,7 +374,7 @@ private[sql] case class CreateTempTableUsing( def run(sqlContext: SQLContext) = { val resolved = ResolvedDataSource(sqlContext, userSpecifiedSchema, provider, options) - sqlContext.registerRDDAsTable( + sqlContext.registerDataFrameAsTable( DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName) Seq.empty } @@ -390,7 +390,7 @@ private[sql] case class CreateTempTableUsingAsSelect( def run(sqlContext: SQLContext) = { val df = DataFrame(sqlContext, query) val resolved = ResolvedDataSource(sqlContext, provider, mode, options, df) - sqlContext.registerRDDAsTable( + sqlContext.registerDataFrameAsTable( DataFrame(sqlContext, LogicalRelation(resolved.relation)), tableName) Seq.empty diff --git a/sql/core/src/test/java/org/apache/spark/sql/jdbc/JavaJDBCTest.java 
b/sql/core/src/test/java/org/apache/spark/sql/jdbc/JavaJDBCTest.java deleted file mode 100644 index 80bd74f5b5525..0000000000000 --- a/sql/core/src/test/java/org/apache/spark/sql/jdbc/JavaJDBCTest.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.jdbc; - -import org.junit.*; -import static org.junit.Assert.*; -import java.sql.Connection; -import java.sql.DriverManager; - -import org.apache.spark.SparkEnv; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.api.java.*; -import org.apache.spark.sql.test.TestSQLContext$; - -public class JavaJDBCTest { - static String url = "jdbc:h2:mem:testdb1"; - - static Connection conn = null; - - // This variable will always be null if TestSQLContext is intact when running - // these tests. Some Java tests do not play nicely with others, however; - // they create a SparkContext of their own at startup and stop it at exit. - // This renders TestSQLContext inoperable, meaning we have to do the same - // thing. If this variable is nonnull, that means we allocated a - // SparkContext of our own and that we need to stop it at teardown. - static JavaSparkContext localSparkContext = null; - - static SQLContext sql = TestSQLContext$.MODULE$; - - @Before - public void beforeTest() throws Exception { - if (SparkEnv.get() == null) { // A previous test destroyed TestSQLContext. - localSparkContext = new JavaSparkContext("local", "JavaAPISuite"); - sql = new SQLContext(localSparkContext); - } - Class.forName("org.h2.Driver"); - conn = DriverManager.getConnection(url); - conn.prepareStatement("create schema test").executeUpdate(); - conn.prepareStatement("create table test.people (name TEXT(32) NOT NULL, theid INTEGER NOT NULL)").executeUpdate(); - conn.prepareStatement("insert into test.people values ('fred', 1)").executeUpdate(); - conn.prepareStatement("insert into test.people values ('mary', 2)").executeUpdate(); - conn.prepareStatement("insert into test.people values ('joe', 3)").executeUpdate(); - conn.commit(); - } - - @After - public void afterTest() throws Exception { - if (localSparkContext != null) { - localSparkContext.stop(); - localSparkContext = null; - } - try { - conn.close(); - } finally { - conn = null; - } - } - - @Test - public void basicTest() throws Exception { - DataFrame rdd = JDBCUtils.jdbcRDD(sql, url, "TEST.PEOPLE"); - Row[] rows = rdd.collect(); - assertEquals(rows.length, 3); - } - - @Test - public void partitioningTest() throws Exception { - String[] parts = new String[2]; - parts[0] = "THEID < 2"; - parts[1] = "THEID = 2"; // Deliberately forget about one of them. 
- DataFrame rdd = JDBCUtils.jdbcRDD(sql, url, "TEST.PEOPLE", parts); - Row[] rows = rdd.collect(); - assertEquals(rows.length, 2); - } - - @Test - public void writeTest() throws Exception { - DataFrame rdd = JDBCUtils.jdbcRDD(sql, url, "TEST.PEOPLE"); - JDBCUtils.createJDBCTable(rdd, url, "TEST.PEOPLECOPY", false); - DataFrame rdd2 = JDBCUtils.jdbcRDD(sql, url, "TEST.PEOPLECOPY"); - Row[] rows = rdd2.collect(); - assertEquals(rows.length, 3); - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index d25c1390db15c..07db672217bc1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -164,17 +164,16 @@ class JDBCSuite extends FunSuite with BeforeAndAfter { } test("Basic API") { - assert(TestSQLContext.jdbcRDD(url, "TEST.PEOPLE").collect.size == 3) + assert(TestSQLContext.jdbc(url, "TEST.PEOPLE").collect.size == 3) } test("Partitioning via JDBCPartitioningInfo API") { - val parts = JDBCPartitioningInfo("THEID", 0, 4, 3) - assert(TestSQLContext.jdbcRDD(url, "TEST.PEOPLE", parts).collect.size == 3) + assert(TestSQLContext.jdbc(url, "TEST.PEOPLE", "THEID", 0, 4, 3).collect.size == 3) } test("Partitioning via list-of-where-clauses API") { val parts = Array[String]("THEID < 2", "THEID >= 2") - assert(TestSQLContext.jdbcRDD(url, "TEST.PEOPLE", parts).collect.size == 3) + assert(TestSQLContext.jdbc(url, "TEST.PEOPLE", parts).collect.size == 3) } test("H2 integral types") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index 21e70936102fd..ad2fbc3f04a9c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -57,8 +57,8 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) srdd.createJDBCTable(url, "TEST.BASICCREATETEST", false) - assert(2 == TestSQLContext.jdbcRDD(url, "TEST.BASICCREATETEST").count) - assert(2 == TestSQLContext.jdbcRDD(url, "TEST.BASICCREATETEST").collect()(0).length) + assert(2 == TestSQLContext.jdbc(url, "TEST.BASICCREATETEST").count) + assert(2 == TestSQLContext.jdbc(url, "TEST.BASICCREATETEST").collect()(0).length) } test("CREATE with overwrite") { @@ -66,12 +66,12 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { val srdd2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2) srdd.createJDBCTable(url, "TEST.DROPTEST", false) - assert(2 == TestSQLContext.jdbcRDD(url, "TEST.DROPTEST").count) - assert(3 == TestSQLContext.jdbcRDD(url, "TEST.DROPTEST").collect()(0).length) + assert(2 == TestSQLContext.jdbc(url, "TEST.DROPTEST").count) + assert(3 == TestSQLContext.jdbc(url, "TEST.DROPTEST").collect()(0).length) srdd2.createJDBCTable(url, "TEST.DROPTEST", true) - assert(1 == TestSQLContext.jdbcRDD(url, "TEST.DROPTEST").count) - assert(2 == TestSQLContext.jdbcRDD(url, "TEST.DROPTEST").collect()(0).length) + assert(1 == TestSQLContext.jdbc(url, "TEST.DROPTEST").count) + assert(2 == TestSQLContext.jdbc(url, "TEST.DROPTEST").collect()(0).length) } test("CREATE then INSERT to append") { @@ -80,8 +80,8 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { srdd.createJDBCTable(url, "TEST.APPENDTEST", false) srdd2.insertIntoJDBC(url, "TEST.APPENDTEST", false) - 
assert(3 == TestSQLContext.jdbcRDD(url, "TEST.APPENDTEST").count) - assert(2 == TestSQLContext.jdbcRDD(url, "TEST.APPENDTEST").collect()(0).length) + assert(3 == TestSQLContext.jdbc(url, "TEST.APPENDTEST").count) + assert(2 == TestSQLContext.jdbc(url, "TEST.APPENDTEST").collect()(0).length) } test("CREATE then INSERT to truncate") { @@ -90,8 +90,8 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { srdd.createJDBCTable(url, "TEST.TRUNCATETEST", false) srdd2.insertIntoJDBC(url, "TEST.TRUNCATETEST", true) - assert(1 == TestSQLContext.jdbcRDD(url, "TEST.TRUNCATETEST").count) - assert(2 == TestSQLContext.jdbcRDD(url, "TEST.TRUNCATETEST").collect()(0).length) + assert(1 == TestSQLContext.jdbc(url, "TEST.TRUNCATETEST").count) + assert(2 == TestSQLContext.jdbc(url, "TEST.TRUNCATETEST").collect()(0).length) } test("Incompatible INSERT to append") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegration.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegration.scala index 89920f2650c3a..4f38110c80cc6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegration.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegration.scala @@ -143,7 +143,7 @@ class MySQLDatabase { } test("Basic test") { - val rdd = TestSQLContext.jdbcRDD(url(ip, "foo"), "tbl") + val rdd = TestSQLContext.jdbc(url(ip, "foo"), "tbl") val rows = rdd.collect assert(rows.length == 2) val types = rows(0).toSeq.map(x => x.getClass.toString) @@ -153,7 +153,7 @@ class MySQLDatabase { } test("Numeric types") { - val rdd = TestSQLContext.jdbcRDD(url(ip, "foo"), "numbers") + val rdd = TestSQLContext.jdbc(url(ip, "foo"), "numbers") val rows = rdd.collect assert(rows.length == 1) val types = rows(0).toSeq.map(x => x.getClass.toString) @@ -181,7 +181,7 @@ class MySQLDatabase { } test("Date types") { - val rdd = TestSQLContext.jdbcRDD(url(ip, "foo"), "dates") + val rdd = TestSQLContext.jdbc(url(ip, "foo"), "dates") val rows = rdd.collect assert(rows.length == 1) val types = rows(0).toSeq.map(x => x.getClass.toString) @@ -199,7 +199,7 @@ class MySQLDatabase { } test("String types") { - val rdd = TestSQLContext.jdbcRDD(url(ip, "foo"), "strings") + val rdd = TestSQLContext.jdbc(url(ip, "foo"), "strings") val rows = rdd.collect assert(rows.length == 1) val types = rows(0).toSeq.map(x => x.getClass.toString) @@ -225,9 +225,9 @@ class MySQLDatabase { } test("Basic write test") { - val rdd1 = TestSQLContext.jdbcRDD(url(ip, "foo"), "numbers") - val rdd2 = TestSQLContext.jdbcRDD(url(ip, "foo"), "dates") - val rdd3 = TestSQLContext.jdbcRDD(url(ip, "foo"), "strings") + val rdd1 = TestSQLContext.jdbc(url(ip, "foo"), "numbers") + val rdd2 = TestSQLContext.jdbc(url(ip, "foo"), "dates") + val rdd3 = TestSQLContext.jdbc(url(ip, "foo"), "strings") rdd1.createJDBCTable(url(ip, "foo"), "numberscopy", false) rdd2.createJDBCTable(url(ip, "foo"), "datescopy", false) rdd3.createJDBCTable(url(ip, "foo"), "stringscopy", false) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegration.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegration.scala index c174d7adb7204..7b47feeb7887e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegration.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegration.scala @@ -113,7 +113,7 @@ class PostgresDatabase { } test("Type mapping for various types") { - val rdd = TestSQLContext.jdbcRDD(url(db.ip), "public.bar") + val rdd = TestSQLContext.jdbc(url(db.ip), 
"public.bar") val rows = rdd.collect assert(rows.length == 1) val types = rows(0).toSeq.map(x => x.getClass.toString) @@ -142,7 +142,7 @@ class PostgresDatabase { } test("Basic write test") { - val rdd = TestSQLContext.jdbcRDD(url(db.ip), "public.bar") + val rdd = TestSQLContext.jdbc(url(db.ip), "public.bar") rdd.createJDBCTable(url(db.ip), "public.barcopy", false) // Test only that it doesn't bomb out. } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala index bfacc51ef57ab..07b5a84fb6602 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/DescribeHiveTableCommand.scala @@ -29,9 +29,9 @@ import org.apache.spark.sql.hive.HiveShim import org.apache.spark.sql.SQLContext /** - * Implementation for "describe [extended] table". - * * :: DeveloperApi :: + * + * Implementation for "describe [extended] table". */ @DeveloperApi case class DescribeHiveTableCommand( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 0aa5f7f7b88bd..6afd8eea05418 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -102,6 +102,10 @@ case class AddFile(path: String) extends RunnableCommand { } } +/** + * :: DeveloperApi :: + */ +@DeveloperApi case class CreateMetastoreDataSource( tableName: String, userSpecifiedSchema: Option[StructType], @@ -141,6 +145,10 @@ case class CreateMetastoreDataSource( } } +/** + * :: DeveloperApi :: + */ +@DeveloperApi case class CreateMetastoreDataSourceAsSelect( tableName: String, provider: String, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JavaJDBCTrampoline.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/package.scala similarity index 66% rename from sql/core/src/main/scala/org/apache/spark/sql/jdbc/JavaJDBCTrampoline.scala rename to sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/package.scala index 86bb67ec74256..4989c42e964ec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JavaJDBCTrampoline.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/package.scala @@ -15,16 +15,11 @@ * limitations under the License. */ -package org.apache.spark.sql.jdbc +package org.apache.spark.sql.hive -import org.apache.spark.sql.DataFrame - -private[jdbc] class JavaJDBCTrampoline { - def createJDBCTable(rdd: DataFrame, url: String, table: String, allowExisting: Boolean) { - rdd.createJDBCTable(url, table, allowExisting); - } - - def insertIntoJDBC(rdd: DataFrame, url: String, table: String, overwrite: Boolean) { - rdd.insertIntoJDBC(url, table, overwrite); - } -} +/** + * Physical execution operators used for running queries against data stored in Hive. These + * are not intended for use by users, but are documents so that it is easier to understand + * the output of EXPLAIN queries. 
+ */ +package object execution diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/package.scala index a6c8ed4f7e866..db074361ef03c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/package.scala @@ -17,4 +17,14 @@ package org.apache.spark.sql +/** + * Support for running Spark SQL queries using functionality from Apache Hive (does not require an + * existing Hive installation). Supported Hive features include: + * - Using HiveQL to express queries. + * - Reading metadata from the Hive Metastore using HiveSerDes. + * - Hive UDFs, UDAs, UDTs + * + * Users that would like access to this functionality should create a + * [[hive.HiveContext HiveContext]] instead of a [[SQLContext]]. + */ package object hive diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/parquet/FakeParquetSerDe.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/parquet/FakeParquetSerDe.scala deleted file mode 100644 index 2a16c9d1a27c9..0000000000000 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/parquet/FakeParquetSerDe.scala +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.hive.parquet - -import java.util.Properties - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category -import org.apache.hadoop.hive.serde2.{SerDeStats, SerDe} -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector -import org.apache.hadoop.io.Writable - -/** - * A placeholder that allows Spark SQL users to create metastore tables that are stored as - * parquet files. It is only intended to pass the checks that the serde is valid and exists - * when a CREATE TABLE is run. The actual work of decoding will be done by ParquetTableScan - * when "spark.sql.hive.convertMetastoreParquet" is set to true. 
- */ -@deprecated("No code should depend on FakeParquetHiveSerDe as it is only intended as a " + - "placeholder in the Hive MetaStore", "1.2.0") -class FakeParquetSerDe extends SerDe { - override def getObjectInspector: ObjectInspector = new ObjectInspector { - override def getCategory: Category = Category.PRIMITIVE - - override def getTypeName: String = "string" - } - - override def deserialize(p1: Writable): AnyRef = throwError - - override def initialize(p1: Configuration, p2: Properties): Unit = {} - - override def getSerializedClass: Class[_ <: Writable] = throwError - - override def getSerDeStats: SerDeStats = throwError - - override def serialize(p1: scala.Any, p2: ObjectInspector): Writable = throwError - - private def throwError = - sys.error( - "spark.sql.hive.convertMetastoreParquet must be set to true to use FakeParquetSerDe") -} diff --git a/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala b/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala index 8534c7d7064e5..30646ddbc29d8 100644 --- a/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala +++ b/sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala @@ -43,7 +43,9 @@ import org.apache.hadoop.mapred.InputFormat import org.apache.spark.sql.types.{Decimal, DecimalType} -case class HiveFunctionWrapper(functionClassName: String) extends java.io.Serializable { +private[hive] case class HiveFunctionWrapper(functionClassName: String) + extends java.io.Serializable { + // for Serialization def this() = this(null) @@ -249,6 +251,9 @@ private[hive] object HiveShim { def setTblNullFormat(crtTbl: CreateTableDesc, tbl: Table) = {} } -class ShimFileSinkDesc(var dir: String, var tableInfo: TableDesc, var compressed: Boolean) +private[hive] class ShimFileSinkDesc( + var dir: String, + var tableInfo: TableDesc, + var compressed: Boolean) extends FileSinkDesc(dir, tableInfo, compressed) { } diff --git a/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala b/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala index 72104f5b55761..f9fcbdae15745 100644 --- a/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala +++ b/sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala @@ -56,7 +56,9 @@ import org.apache.spark.sql.types.{Decimal, DecimalType} * * @param functionClassName UDF class name */ -case class HiveFunctionWrapper(var functionClassName: String) extends java.io.Externalizable { +private[hive] case class HiveFunctionWrapper(var functionClassName: String) + extends java.io.Externalizable { + // for Serialization def this() = this(null) @@ -423,7 +425,10 @@ private[hive] object HiveShim { * Bug introduced in hive-0.13. FileSinkDesc is serilizable, but its member path is not. * Fix it through wrapper. */ -class ShimFileSinkDesc(var dir: String, var tableInfo: TableDesc, var compressed: Boolean) +private[hive] class ShimFileSinkDesc( + var dir: String, + var tableInfo: TableDesc, + var compressed: Boolean) extends Serializable with Logging { var compressCodec: String = _ var compressType: String = _ From d8adefefcc2a4af32295440ed1d4917a6968f017 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 17 Feb 2015 10:22:48 -0800 Subject: [PATCH 215/357] [SPARK-5859] [PySpark] [SQL] fix DataFrame Python API 1. added explain() 2. add isLocal() 3. do not call show() in __repl__ 4. add foreach() and foreachPartition() 5. add distinct() 6. fix functions.col()/column()/lit() 7. 
fix unit tests in sql/functions.py 8. fix unicode in showString() Author: Davies Liu Closes #4645 from davies/df6 and squashes the following commits: 6b46a2c [Davies Liu] fix DataFrame Python API --- python/pyspark/sql/dataframe.py | 65 +++++++++++++++++++++++++++------ python/pyspark/sql/functions.py | 12 +++--- 2 files changed, 59 insertions(+), 18 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 28a59e73a3410..841724095f693 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -238,6 +238,22 @@ def printSchema(self): """ print (self._jdf.schema().treeString()) + def explain(self, extended=False): + """ + Prints the plans (logical and physical) to the console for + debugging purpose. + + If extended is False, only prints the physical plan. + """ + self._jdf.explain(extended) + + def isLocal(self): + """ + Returns True if the `collect` and `take` methods can be run locally + (without any Spark executors). + """ + return self._jdf.isLocal() + def show(self): """ Print the first 20 rows. @@ -247,14 +263,12 @@ def show(self): 2 Alice 5 Bob >>> df - age name - 2 Alice - 5 Bob + DataFrame[age: int, name: string] """ - print (self) + print self._jdf.showString().encode('utf8', 'ignore') def __repr__(self): - return self._jdf.showString() + return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes)) def count(self): """Return the number of elements in this RDD. @@ -336,6 +350,8 @@ def mapPartitions(self, f, preservesPartitioning=False): """ Return a new RDD by applying a function to each partition. + It's a shorthand for df.rdd.mapPartitions() + >>> rdd = sc.parallelize([1, 2, 3, 4], 4) >>> def f(iterator): yield 1 >>> rdd.mapPartitions(f).sum() @@ -343,6 +359,31 @@ def mapPartitions(self, f, preservesPartitioning=False): """ return self.rdd.mapPartitions(f, preservesPartitioning) + def foreach(self, f): + """ + Applies a function to all rows of this DataFrame. + + It's a shorthand for df.rdd.foreach() + + >>> def f(person): + ... print person.name + >>> df.foreach(f) + """ + return self.rdd.foreach(f) + + def foreachPartition(self, f): + """ + Applies a function to each partition of this DataFrame. + + It's a shorthand for df.rdd.foreachPartition() + + >>> def f(people): + ... for person in people: + ... print person.name + >>> df.foreachPartition(f) + """ + return self.rdd.foreachPartition(f) + def cache(self): """ Persist with the default storage level (C{MEMORY_ONLY_SER}). """ @@ -377,8 +418,13 @@ def repartition(self, numPartitions): """ Return a new :class:`DataFrame` that has exactly `numPartitions` partitions. """ - rdd = self._jdf.repartition(numPartitions, None) - return DataFrame(rdd, self.sql_ctx) + return DataFrame(self._jdf.repartition(numPartitions, None), self.sql_ctx) + + def distinct(self): + """ + Return a new :class:`DataFrame` containing the distinct rows in this DataFrame. 
+ """ + return DataFrame(self._jdf.distinct(), self.sql_ctx) def sample(self, withReplacement, fraction, seed=None): """ @@ -957,10 +1003,7 @@ def cast(self, dataType): return Column(jc, self.sql_ctx) def __repr__(self): - if self._jdf.isComputable(): - return self._jdf.samples() - else: - return 'Column<%s>' % self._jdf.toString() + return 'Column<%s>' % self._jdf.toString().encode('utf8') def toPandas(self): """ diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index d0e090607ff4f..fc61162f0b827 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -37,7 +37,7 @@ def _create_function(name, doc=""): """ Create a function for aggregator by name""" def _(col): sc = SparkContext._active_spark_context - jc = getattr(sc._jvm.functions, name)(_to_java_column(col)) + jc = getattr(sc._jvm.functions, name)(col._jc if isinstance(col, Column) else col) return Column(jc) _.__name__ = name _.__doc__ = doc @@ -140,6 +140,7 @@ def __call__(self, *cols): def udf(f, returnType=StringType()): """Create a user defined function (UDF) + >>> from pyspark.sql.types import IntegerType >>> slen = udf(lambda s: len(s), IntegerType()) >>> df.select(slen(df.name).alias('slen')).collect() [Row(slen=5), Row(slen=3)] @@ -151,17 +152,14 @@ def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext - import pyspark.sql.dataframe - globs = pyspark.sql.dataframe.__dict__.copy() + import pyspark.sql.functions + globs = pyspark.sql.functions.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlCtx'] = SQLContext(sc) globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() - globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() - globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80), - Row(name='Bob', age=5, height=85)]).toDF() (failure_count, test_count) = doctest.testmod( - pyspark.sql.dataframe, globs=globs, + pyspark.sql.functions, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) globs['sc'].stop() if failure_count: From d8f69cf78862d13a48392a0b94388b8d403523da Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Tue, 17 Feb 2015 10:57:16 -0800 Subject: [PATCH 216/357] [SPARK-5778] throw if nonexistent metrics config file provided previous behavior was to log an error; this is fine in the general case where no `spark.metrics.conf` parameter was specified, in which case a default `metrics.properties` is looked for, and the execption logged and suppressed if it doesn't exist. if the user has purposefully specified a metrics.conf file, however, it makes more sense to show them an error when said file doesn't exist. 
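Editor's note: a hedged illustration of the user-visible change described in the commit message above; the path is invented and the snippet is not part of the patch.

    import org.apache.spark.SparkConf
    // With this change, pointing spark.metrics.conf at a missing file fails fast instead of being
    // logged and ignored: opening the file now happens outside the try/catch in MetricsConfig
    // (see the diff below), so the resulting exception propagates out of initialize().
    val conf = new SparkConf().set("spark.metrics.conf", "/no/such/metrics.properties")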
Author: Ryan Williams Closes #4571 from ryan-williams/metrics and squashes the following commits: 5bccb14 [Ryan Williams] private-ize some MetricsConfig members 08ff998 [Ryan Williams] rename METRICS_CONF: DEFAULT_METRICS_CONF_FILENAME f4d7fab [Ryan Williams] fix tests ad24b0e [Ryan Williams] add "metrics.properties" to .rat-excludes 94e810b [Ryan Williams] throw if nonexistent Sink class is specified 31d2c30 [Ryan Williams] metrics code review feedback 56287db [Ryan Williams] throw if nonexistent metrics config file provided --- .rat-excludes | 1 + .../apache/spark/metrics/MetricsConfig.scala | 32 ++++++++++--------- .../apache/spark/metrics/MetricsSystem.scala | 5 ++- .../resources/test_metrics_system.properties | 2 -- .../spark/metrics/MetricsConfigSuite.scala | 2 +- 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/.rat-excludes b/.rat-excludes index a788e8273d8a2..8c61e67a0c7d1 100644 --- a/.rat-excludes +++ b/.rat-excludes @@ -19,6 +19,7 @@ fairscheduler.xml.template spark-defaults.conf.template log4j.properties log4j.properties.template +metrics.properties metrics.properties.template slaves slaves.template diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala index 1b7a5d1f1980a..8edf493780687 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala @@ -28,12 +28,12 @@ import org.apache.spark.util.Utils private[spark] class MetricsConfig(val configFile: Option[String]) extends Logging { - val DEFAULT_PREFIX = "*" - val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r - val METRICS_CONF = "metrics.properties" + private val DEFAULT_PREFIX = "*" + private val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r + private val DEFAULT_METRICS_CONF_FILENAME = "metrics.properties" - val properties = new Properties() - var propertyCategories: mutable.HashMap[String, Properties] = null + private[metrics] val properties = new Properties() + private[metrics] var propertyCategories: mutable.HashMap[String, Properties] = null private def setDefaultProperties(prop: Properties) { prop.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet") @@ -47,20 +47,22 @@ private[spark] class MetricsConfig(val configFile: Option[String]) extends Loggi setDefaultProperties(properties) // If spark.metrics.conf is not set, try to get file in class path - var is: InputStream = null - try { - is = configFile match { - case Some(f) => new FileInputStream(f) - case None => Utils.getSparkClassLoader.getResourceAsStream(METRICS_CONF) + val isOpt: Option[InputStream] = configFile.map(new FileInputStream(_)).orElse { + try { + Option(Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_METRICS_CONF_FILENAME)) + } catch { + case e: Exception => + logError("Error loading default configuration file", e) + None } + } - if (is != null) { + isOpt.foreach { is => + try { properties.load(is) + } finally { + is.close() } - } catch { - case e: Exception => logError("Error loading configure file", e) - } finally { - if (is != null) is.close() } propertyCategories = subProperties(properties, INSTANCE_REGEX) diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala index 83e8eb71260eb..345db36630fd5 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala +++ 
b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala @@ -191,7 +191,10 @@ private[spark] class MetricsSystem private ( sinks += sink.asInstanceOf[Sink] } } catch { - case e: Exception => logError("Sink class " + classPath + " cannot be instantialized", e) + case e: Exception => { + logError("Sink class " + classPath + " cannot be instantialized") + throw e + } } } } diff --git a/core/src/test/resources/test_metrics_system.properties b/core/src/test/resources/test_metrics_system.properties index 35d0bd3b8d0b8..4e8b8465696e5 100644 --- a/core/src/test/resources/test_metrics_system.properties +++ b/core/src/test/resources/test_metrics_system.properties @@ -18,7 +18,5 @@ *.sink.console.period = 10 *.sink.console.unit = seconds test.sink.console.class = org.apache.spark.metrics.sink.ConsoleSink -test.sink.dummy.class = org.apache.spark.metrics.sink.DummySink -test.source.dummy.class = org.apache.spark.metrics.source.DummySource test.sink.console.period = 20 test.sink.console.unit = minutes diff --git a/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala b/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala index 1a9ce8c607dcd..37e528435aa5d 100644 --- a/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/MetricsConfigSuite.scala @@ -27,7 +27,7 @@ class MetricsConfigSuite extends FunSuite with BeforeAndAfter { } test("MetricsConfig with default properties") { - val conf = new MetricsConfig(Option("dummy-file")) + val conf = new MetricsConfig(None) conf.initialize() assert(conf.properties.size() === 4) From b271c265b742fa6947522eda4592e9e6a7fd1f3a Mon Sep 17 00:00:00 2001 From: xukun 00228947 Date: Tue, 17 Feb 2015 18:59:41 +0000 Subject: [PATCH 217/357] [SPARK-5661]function hasShutdownDeleteTachyonDir should use shutdownDeleteTachyonPaths to determine whether contains file hasShutdownDeleteTachyonDir(file: TachyonFile) should use shutdownDeleteTachyonPaths(not shutdownDeletePaths) to determine Whether contain file. To solve it ,delete two unused function. Author: xukun 00228947 Author: viper-kun Closes #4418 from viper-kun/deleteunusedfun and squashes the following commits: 87340eb [viper-kun] fix style 3d6c69e [xukun 00228947] fix bug 2bc397e [xukun 00228947] deleteunusedfun --- core/src/main/scala/org/apache/spark/util/Utils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index c06bd6fab0cc9..df21ed37e76b1 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -213,8 +213,8 @@ private[spark] object Utils extends Logging { // Is the path already registered to be deleted via a shutdown hook ? def hasShutdownDeleteTachyonDir(file: TachyonFile): Boolean = { val absolutePath = file.getPath() - shutdownDeletePaths.synchronized { - shutdownDeletePaths.contains(absolutePath) + shutdownDeleteTachyonPaths.synchronized { + shutdownDeleteTachyonPaths.contains(absolutePath) } } From 9b746f380869b54d673e3758ca5e4475f76c864a Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 17 Feb 2015 11:19:23 -0800 Subject: [PATCH 218/357] [SPARK-3381] [MLlib] Eliminate bins for unordered features in DecisionTrees For unordered features, it is sufficient to use splits since the threshold of the split corresponds the threshold of the HighSplit of the bin and there is no use of the LowSplit. 
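To make the counting concrete: an unordered categorical feature with arity M yields 2^(M-1) - 1 candidate splits, each identified by a subset of categories, and the aggregation only needs a membership test against that subset, so a separate bin carries no extra information. The helper below is a hypothetical Python sketch of that enumeration, not code from this patch:

    def candidate_splits(arity):
        # each split index encodes, bit by bit, which categories fall on the split's side
        num_splits = 2 ** (arity - 1) - 1
        splits = []
        for index in range(1, num_splits + 1):
            splits.append([float(c) for c in range(arity) if index & (1 << c)])
        return splits

    print(candidate_splits(3))  # [[0.0], [1.0], [0.0, 1.0]]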
Author: MechCoder Closes #4231 from MechCoder/spark-3381 and squashes the following commits: 58c19a5 [MechCoder] COSMIT c274b74 [MechCoder] Remove unordered feature calculation in labeledPointToTreePoint b2b9b89 [MechCoder] COSMIT d3ee042 [MechCoder] [SPARK-3381] [MLlib] Eliminate bins for unordered features --- .../spark/mllib/tree/DecisionTree.scala | 37 +++++-------------- .../spark/mllib/tree/impl/TreePoint.scala | 14 ++----- .../spark/mllib/tree/DecisionTreeSuite.scala | 37 +------------------ 3 files changed, 15 insertions(+), 73 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index f1f85994e61b2..b9d0c56dd1ea3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -327,14 +327,14 @@ object DecisionTree extends Serializable with Logging { * @param agg Array storing aggregate calculation, with a set of sufficient statistics for * each (feature, bin). * @param treePoint Data point being aggregated. - * @param bins possible bins for all features, indexed (numFeatures)(numBins) + * @param splits possible splits indexed (numFeatures)(numSplits) * @param unorderedFeatures Set of indices of unordered features. * @param instanceWeight Weight (importance) of instance in dataset. */ private def mixedBinSeqOp( agg: DTStatsAggregator, treePoint: TreePoint, - bins: Array[Array[Bin]], + splits: Array[Array[Split]], unorderedFeatures: Set[Int], instanceWeight: Double, featuresForNode: Option[Array[Int]]): Unit = { @@ -362,7 +362,7 @@ object DecisionTree extends Serializable with Logging { val numSplits = agg.metadata.numSplits(featureIndex) var splitIndex = 0 while (splitIndex < numSplits) { - if (bins(featureIndex)(splitIndex).highSplit.categories.contains(featureValue)) { + if (splits(featureIndex)(splitIndex).categories.contains(featureValue)) { agg.featureUpdate(leftNodeFeatureOffset, splitIndex, treePoint.label, instanceWeight) } else { @@ -506,8 +506,8 @@ object DecisionTree extends Serializable with Logging { if (metadata.unorderedFeatures.isEmpty) { orderedBinSeqOp(agg(aggNodeIndex), baggedPoint.datum, instanceWeight, featuresForNode) } else { - mixedBinSeqOp(agg(aggNodeIndex), baggedPoint.datum, bins, metadata.unorderedFeatures, - instanceWeight, featuresForNode) + mixedBinSeqOp(agg(aggNodeIndex), baggedPoint.datum, splits, + metadata.unorderedFeatures, instanceWeight, featuresForNode) } } } @@ -1024,35 +1024,15 @@ object DecisionTree extends Serializable with Logging { // Categorical feature val featureArity = metadata.featureArity(featureIndex) if (metadata.isUnordered(featureIndex)) { - // TODO: The second half of the bins are unused. Actually, we could just use - // splits and not build bins for unordered features. That should be part of - // a later PR since it will require changing other code (using splits instead - // of bins in a few places). 
// Unordered features - // 2^(maxFeatureValue - 1) - 1 combinations + // 2^(maxFeatureValue - 1) - 1 combinations splits(featureIndex) = new Array[Split](numSplits) - bins(featureIndex) = new Array[Bin](numBins) var splitIndex = 0 while (splitIndex < numSplits) { val categories: List[Double] = extractMultiClassCategories(splitIndex + 1, featureArity) splits(featureIndex)(splitIndex) = new Split(featureIndex, Double.MinValue, Categorical, categories) - bins(featureIndex)(splitIndex) = { - if (splitIndex == 0) { - new Bin( - new DummyCategoricalSplit(featureIndex, Categorical), - splits(featureIndex)(0), - Categorical, - Double.MinValue) - } else { - new Bin( - splits(featureIndex)(splitIndex - 1), - splits(featureIndex)(splitIndex), - Categorical, - Double.MinValue) - } - } splitIndex += 1 } } else { @@ -1060,8 +1040,11 @@ object DecisionTree extends Serializable with Logging { // Bins correspond to feature values, so we do not need to compute splits or bins // beforehand. Splits are constructed as needed during training. splits(featureIndex) = new Array[Split](0) - bins(featureIndex) = new Array[Bin](0) } + // For ordered features, bins correspond to feature values. + // For unordered categorical features, there is no need to construct the bins. + // since there is a one-to-one correspondence between the splits and the bins. + bins(featureIndex) = new Array[Bin](0) } featureIndex += 1 } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TreePoint.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TreePoint.scala index 35e361ae309cc..50b292e71b067 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TreePoint.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TreePoint.scala @@ -55,17 +55,15 @@ private[tree] object TreePoint { input: RDD[LabeledPoint], bins: Array[Array[Bin]], metadata: DecisionTreeMetadata): RDD[TreePoint] = { - // Construct arrays for featureArity and isUnordered for efficiency in the inner loop. + // Construct arrays for featureArity for efficiency in the inner loop. val featureArity: Array[Int] = new Array[Int](metadata.numFeatures) - val isUnordered: Array[Boolean] = new Array[Boolean](metadata.numFeatures) var featureIndex = 0 while (featureIndex < metadata.numFeatures) { featureArity(featureIndex) = metadata.featureArity.getOrElse(featureIndex, 0) - isUnordered(featureIndex) = metadata.isUnordered(featureIndex) featureIndex += 1 } input.map { x => - TreePoint.labeledPointToTreePoint(x, bins, featureArity, isUnordered) + TreePoint.labeledPointToTreePoint(x, bins, featureArity) } } @@ -74,19 +72,17 @@ private[tree] object TreePoint { * @param bins Bins for features, of size (numFeatures, numBins). * @param featureArity Array indexed by feature, with value 0 for continuous and numCategories * for categorical features. - * @param isUnordered Array index by feature, with value true for unordered categorical features. 
*/ private def labeledPointToTreePoint( labeledPoint: LabeledPoint, bins: Array[Array[Bin]], - featureArity: Array[Int], - isUnordered: Array[Boolean]): TreePoint = { + featureArity: Array[Int]): TreePoint = { val numFeatures = labeledPoint.features.size val arr = new Array[Int](numFeatures) var featureIndex = 0 while (featureIndex < numFeatures) { arr(featureIndex) = findBin(featureIndex, labeledPoint, featureArity(featureIndex), - isUnordered(featureIndex), bins) + bins) featureIndex += 1 } new TreePoint(labeledPoint.label, arr) @@ -96,14 +92,12 @@ private[tree] object TreePoint { * Find bin for one (labeledPoint, feature). * * @param featureArity 0 for continuous features; number of categories for categorical features. - * @param isUnorderedFeature (only applies if feature is categorical) * @param bins Bins for features, of size (numFeatures, numBins). */ private def findBin( featureIndex: Int, labeledPoint: LabeledPoint, featureArity: Int, - isUnorderedFeature: Boolean, bins: Array[Array[Bin]]): Int = { /** diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala index 7b1aed5ffeb3e..4c162df810bb2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala @@ -190,7 +190,7 @@ class DecisionTreeSuite extends FunSuite with MLlibTestSparkContext { assert(splits.length === 2) assert(bins.length === 2) assert(splits(0).length === 3) - assert(bins(0).length === 6) + assert(bins(0).length === 0) // Expecting 2^2 - 1 = 3 bins/splits assert(splits(0)(0).feature === 0) @@ -228,41 +228,6 @@ class DecisionTreeSuite extends FunSuite with MLlibTestSparkContext { assert(splits(1)(2).categories.contains(0.0)) assert(splits(1)(2).categories.contains(1.0)) - // Check bins. 
- - assert(bins(0)(0).category === Double.MinValue) - assert(bins(0)(0).lowSplit.categories.length === 0) - assert(bins(0)(0).highSplit.categories.length === 1) - assert(bins(0)(0).highSplit.categories.contains(0.0)) - assert(bins(1)(0).category === Double.MinValue) - assert(bins(1)(0).lowSplit.categories.length === 0) - assert(bins(1)(0).highSplit.categories.length === 1) - assert(bins(1)(0).highSplit.categories.contains(0.0)) - - assert(bins(0)(1).category === Double.MinValue) - assert(bins(0)(1).lowSplit.categories.length === 1) - assert(bins(0)(1).lowSplit.categories.contains(0.0)) - assert(bins(0)(1).highSplit.categories.length === 1) - assert(bins(0)(1).highSplit.categories.contains(1.0)) - assert(bins(1)(1).category === Double.MinValue) - assert(bins(1)(1).lowSplit.categories.length === 1) - assert(bins(1)(1).lowSplit.categories.contains(0.0)) - assert(bins(1)(1).highSplit.categories.length === 1) - assert(bins(1)(1).highSplit.categories.contains(1.0)) - - assert(bins(0)(2).category === Double.MinValue) - assert(bins(0)(2).lowSplit.categories.length === 1) - assert(bins(0)(2).lowSplit.categories.contains(1.0)) - assert(bins(0)(2).highSplit.categories.length === 2) - assert(bins(0)(2).highSplit.categories.contains(1.0)) - assert(bins(0)(2).highSplit.categories.contains(0.0)) - assert(bins(1)(2).category === Double.MinValue) - assert(bins(1)(2).lowSplit.categories.length === 1) - assert(bins(1)(2).lowSplit.categories.contains(1.0)) - assert(bins(1)(2).highSplit.categories.length === 2) - assert(bins(1)(2).highSplit.categories.contains(1.0)) - assert(bins(1)(2).highSplit.categories.contains(0.0)) - } test("Multiclass classification with ordered categorical features: split and bin calculations") { From 24f358b9d6bc7a72a4fb493b7f845a40ed941a5d Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 17 Feb 2015 11:35:26 -0800 Subject: [PATCH 219/357] MAINTENANCE: Automated closing of pull requests. 
This commit exists to close the following pull requests on Github: Closes #3297 (close requested by 'andrewor14') Closes #3345 (close requested by 'pwendell') Closes #2729 (close requested by 'srowen') Closes #2320 (close requested by 'pwendell') Closes #4529 (close requested by 'andrewor14') Closes #2098 (close requested by 'srowen') Closes #4120 (close requested by 'andrewor14') From 49c19fdbad57f0609bbcc9278f9eaa8115a73604 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 17 Feb 2015 19:40:06 +0000 Subject: [PATCH 220/357] SPARK-5841 [CORE] [HOTFIX] Memory leak in DiskBlockManager Avoid call to remove shutdown hook being called from shutdown hook CC pwendell JoshRosen MattWhelan Author: Sean Owen Closes #4648 from srowen/SPARK-5841.2 and squashes the following commits: 51548db [Sean Owen] Avoid call to remove shutdown hook being called from shutdown hook --- .../scala/org/apache/spark/storage/DiskBlockManager.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index ae9df8cbe9821..b297f3fd9dd1e 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -138,7 +138,7 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon val shutdownHook = new Thread("delete Spark local dirs") { override def run(): Unit = Utils.logUncaughtExceptions { logDebug("Shutdown hook called") - DiskBlockManager.this.stop() + DiskBlockManager.this.doStop() } } Runtime.getRuntime.addShutdownHook(shutdownHook) @@ -149,7 +149,10 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon private[spark] def stop() { // Remove the shutdown hook. It causes memory leaks if we leave it around. Runtime.getRuntime.removeShutdownHook(shutdownHook) + doStop() + } + private def doStop(): Unit = { // Only perform cleanup if an external service is not serving our shuffle files. if (!blockManager.externalShuffleServiceEnabled || blockManager.blockManagerId.isDriver) { localDirs.foreach { localDir => From fc4eb9505adda192eb38cb4454d532027690bfa3 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 17 Feb 2015 12:05:06 -0800 Subject: [PATCH 221/357] [SPARK-5864] [PySpark] support .jar as python package A jar file containing Python sources in it could be used as a Python package, just like zip file. spark-submit already put the jar file into PYTHONPATH, this patch also put it in the sys.path, then it could be used in Python worker. 
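A short usage sketch, assuming a hypothetical mylib.jar that bundles a top-level Python module named mylib (both names are made up for illustration):

    from pyspark import SparkContext

    sc = SparkContext(appName="jar-as-python-package")
    # a .jar is now accepted here just like a .zip or .egg
    sc.addPyFile("/path/to/mylib.jar")

    def f(x):
        import mylib  # importable on the workers once the jar is shipped
        return mylib.__name__

    print(sc.parallelize([1, 2, 3]).map(f).collect())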
Author: Davies Liu Closes #4652 from davies/jar and squashes the following commits: 17d3f76 [Davies Liu] support .jar as python package --- python/pyspark/context.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index bf1f61c8504ed..40b3152b23843 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -64,6 +64,8 @@ class SparkContext(object): _lock = Lock() _python_includes = None # zip and egg files that need to be added to PYTHONPATH + PACKAGE_EXTENSIONS = ('.zip', '.egg', '.jar') + def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, environment=None, batchSize=0, serializer=PickleSerializer(), conf=None, gateway=None, jsc=None, profiler_cls=BasicProfiler): @@ -185,7 +187,7 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, for path in self._conf.get("spark.submit.pyFiles", "").split(","): if path != "": (dirname, filename) = os.path.split(path) - if filename.lower().endswith("zip") or filename.lower().endswith("egg"): + if filename[-4:].lower() in self.PACKAGE_EXTENSIONS: self._python_includes.append(filename) sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename)) @@ -705,7 +707,7 @@ def addPyFile(self, path): self.addFile(path) (dirname, filename) = os.path.split(path) # dirname may be directory or HDFS/S3 prefix - if filename.endswith('.zip') or filename.endswith('.ZIP') or filename.endswith('.egg'): + if filename[-4:].lower() in self.PACKAGE_EXTENSIONS: self._python_includes.append(filename) # for tests in local mode sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename)) From 31efb39c1deb253032b38e8fbafde4b2b1dde1f6 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 17 Feb 2015 12:16:52 -0800 Subject: [PATCH 222/357] [Minor] fix typo in SQL document Author: CodingCat Closes #4656 from CodingCat/fix_typo and squashes the following commits: b41d15c [CodingCat] recover 689fe46 [CodingCat] fix typo --- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 8022c5ecc2430..0146a4ed1b745 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -159,7 +159,7 @@ you to construct DataFrames when the columns and their types are not known until
    -The Scala interaface for Spark SQL supports automatically converting an RDD containing case classes +The Scala interface for Spark SQL supports automatically converting an RDD containing case classes to a DataFrame. The case class defines the schema of the table. The names of the arguments to the case class are read using reflection and become the names of the columns. Case classes can also be nested or contain complex From 4611de1cef7363bc71ec608560dfd866ae477747 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 17 Feb 2015 12:23:18 -0800 Subject: [PATCH 223/357] [SPARK-5862][SQL] Only transformUp the given plan once in HiveMetastoreCatalog Current `ParquetConversions` in `HiveMetastoreCatalog` will transformUp the given plan multiple times if there are many Metastore Parquet tables. Since the transformUp operation is recursive, it should be better to only perform it once. Author: Liang-Chi Hsieh Closes #4651 from viirya/parquet_atonce and squashes the following commits: c1ed29d [Liang-Chi Hsieh] Fix bug. e0f919b [Liang-Chi Hsieh] Only transformUp the given plan once. --- .../spark/sql/hive/HiveMetastoreCatalog.scala | 37 ++++++++++--------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 0e43faa8afdaf..cfd6f27371d0d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -430,33 +430,36 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with hive.convertMetastoreParquet && hive.conf.parquetUseDataSourceApi && relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => - relation + val parquetRelation = convertToParquetRelation(relation) + val attributedRewrites = relation.output.zip(parquetRelation.output) + (relation, parquetRelation, attributedRewrites) // Read path case p @ PhysicalOperation(_, _, relation: MetastoreRelation) if hive.convertMetastoreParquet && hive.conf.parquetUseDataSourceApi && relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => - relation + val parquetRelation = convertToParquetRelation(relation) + val attributedRewrites = relation.output.zip(parquetRelation.output) + (relation, parquetRelation, attributedRewrites) } + val relationMap = toBeReplaced.map(r => (r._1, r._2)).toMap + val attributedRewrites = AttributeMap(toBeReplaced.map(_._3).fold(Nil)(_ ++: _)) + // Replaces all `MetastoreRelation`s with corresponding `ParquetRelation2`s, and fixes // attribute IDs referenced in other nodes. 
- toBeReplaced.distinct.foldLeft(plan) { (lastPlan, relation) => - val parquetRelation = convertToParquetRelation(relation) - val attributedRewrites = AttributeMap(relation.output.zip(parquetRelation.output)) - - lastPlan.transformUp { - case r: MetastoreRelation if r == relation => { - val withAlias = - r.alias.map(a => Subquery(a, parquetRelation)).getOrElse( - Subquery(r.tableName, parquetRelation)) - - withAlias - } - case other => other.transformExpressions { - case a: Attribute if a.resolved => attributedRewrites.getOrElse(a, a) - } + plan.transformUp { + case r: MetastoreRelation if relationMap.contains(r) => { + val parquetRelation = relationMap(r) + val withAlias = + r.alias.map(a => Subquery(a, parquetRelation)).getOrElse( + Subquery(r.tableName, parquetRelation)) + + withAlias + } + case other => other.transformExpressions { + case a: Attribute if a.resolved => attributedRewrites.getOrElse(a, a) } } } From ac506b7c2846f656e03839bbd0e93827c7cc613e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 17 Feb 2015 12:24:13 -0800 Subject: [PATCH 224/357] [Minor][SQL] Use same function to check path parameter in JSONRelation Author: Liang-Chi Hsieh Closes #4649 from viirya/use_checkpath and squashes the following commits: 0f9a1a1 [Liang-Chi Hsieh] Use same function to check path parameter. --- .../main/scala/org/apache/spark/sql/json/JSONRelation.scala | 4 ++-- .../org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala index 24848634de9cf..3b68b7c275016 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala @@ -37,7 +37,7 @@ private[sql] class DefaultSource override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { - val path = parameters.getOrElse("path", sys.error("Option 'path' not specified")) + val path = checkPath(parameters) val samplingRatio = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) JSONRelation(path, samplingRatio, None)(sqlContext) @@ -48,7 +48,7 @@ private[sql] class DefaultSource sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { - val path = parameters.getOrElse("path", sys.error("Option 'path' not specified")) + val path = checkPath(parameters) val samplingRatio = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) JSONRelation(path, samplingRatio, Some(schema))(sqlContext) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 0263e3bb56617..485d5c95bfc44 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -547,7 +547,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { Map.empty[String, String]) }.getMessage assert( - message.contains("Option 'path' not specified"), + message.contains("'path' must be specified for json data."), "We should complain that path is not specified.") sql("DROP TABLE savedJsonTable") From 9d281fa56022800dc008a3de233fec44379a2bd7 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Tue, 17 Feb 2015 12:25:35 -0800 Subject: [PATCH 
225/357] [SQL] [Minor] Update the HiveContext Unittest In unit test, the table src(key INT, value STRING) is not the same as HIVE src(key STRING, value STRING) https://github.com/apache/hive/blob/branch-0.13/data/scripts/q_test_init.sql And in the reflect.q, test failed for expression `reflect("java.lang.Integer", "valueOf", key, 16)`, which expect the argument `key` as STRING not INT. This PR doesn't aim to change the `src` schema, we can do that after 1.3 released, however, we probably need to re-generate all the golden files. Author: Cheng Hao Closes #4584 from chenghao-intel/reflect and squashes the following commits: e5bdc3a [Cheng Hao] Move the test case reflect into blacklist 184abfd [Cheng Hao] revert the change to table src1 d9bcf92 [Cheng Hao] Update the HiveContext Unittest --- .../spark/sql/hive/execution/HiveCompatibilitySuite.scala | 6 ++++++ .../scala/org/apache/spark/sql/hive/test/TestHive.scala | 1 + .../golden/udf_reflect2-0-50131c0ba7b7a6b65c789a5a8497bada | 1 + .../golden/udf_reflect2-1-7bec330c7bc6f71cbaf9bf1883d1b184 | 1 + .../golden/udf_reflect2-2-c5a05379f482215a5a484bed0299bf19 | 3 +++ .../golden/udf_reflect2-3-effc057c78c00b0af26a4ac0f5f116ca | 0 .../golden/udf_reflect2-4-73d466e70e96e9e5f0cd373b37d4e1f4 | 5 +++++ 7 files changed, 17 insertions(+) create mode 100644 sql/hive/src/test/resources/golden/udf_reflect2-0-50131c0ba7b7a6b65c789a5a8497bada create mode 100644 sql/hive/src/test/resources/golden/udf_reflect2-1-7bec330c7bc6f71cbaf9bf1883d1b184 create mode 100644 sql/hive/src/test/resources/golden/udf_reflect2-2-c5a05379f482215a5a484bed0299bf19 create mode 100644 sql/hive/src/test/resources/golden/udf_reflect2-3-effc057c78c00b0af26a4ac0f5f116ca create mode 100644 sql/hive/src/test/resources/golden/udf_reflect2-4-73d466e70e96e9e5f0cd373b37d4e1f4 diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 133f2d3c84a2e..c6ead4562d51e 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -225,6 +225,11 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // Needs constant object inspectors "udf_round", + // the table src(key INT, value STRING) is not the same as HIVE unittest. In Hive + // is src(key STRING, value STRING), and in the reflect.q, it failed in + // Integer.valueOf, which expect the first argument passed as STRING type not INT. + "udf_reflect", + // Sort with Limit clause causes failure. "ctas", "ctas_hadoop20", @@ -886,6 +891,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "udf_power", "udf_radians", "udf_rand", + "udf_reflect2", "udf_regexp", "udf_regexp_extract", "udf_regexp_replace", diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 840fbc197259a..a2d99f1f4b28d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -196,6 +196,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { // The test tables that are defined in the Hive QTestUtil. 
// /itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java + // https://github.com/apache/hive/blob/branch-0.13/data/scripts/q_test_init.sql val hiveQTestUtilTables = Seq( TestTable("src", "CREATE TABLE src (key INT, value STRING)".cmd, diff --git a/sql/hive/src/test/resources/golden/udf_reflect2-0-50131c0ba7b7a6b65c789a5a8497bada b/sql/hive/src/test/resources/golden/udf_reflect2-0-50131c0ba7b7a6b65c789a5a8497bada new file mode 100644 index 0000000000000..573541ac9702d --- /dev/null +++ b/sql/hive/src/test/resources/golden/udf_reflect2-0-50131c0ba7b7a6b65c789a5a8497bada @@ -0,0 +1 @@ +0 diff --git a/sql/hive/src/test/resources/golden/udf_reflect2-1-7bec330c7bc6f71cbaf9bf1883d1b184 b/sql/hive/src/test/resources/golden/udf_reflect2-1-7bec330c7bc6f71cbaf9bf1883d1b184 new file mode 100644 index 0000000000000..cd35e5b290db5 --- /dev/null +++ b/sql/hive/src/test/resources/golden/udf_reflect2-1-7bec330c7bc6f71cbaf9bf1883d1b184 @@ -0,0 +1 @@ +reflect2(arg0,method[,arg1[,arg2..]]) calls method of arg0 with reflection diff --git a/sql/hive/src/test/resources/golden/udf_reflect2-2-c5a05379f482215a5a484bed0299bf19 b/sql/hive/src/test/resources/golden/udf_reflect2-2-c5a05379f482215a5a484bed0299bf19 new file mode 100644 index 0000000000000..48ef97292ab62 --- /dev/null +++ b/sql/hive/src/test/resources/golden/udf_reflect2-2-c5a05379f482215a5a484bed0299bf19 @@ -0,0 +1,3 @@ +reflect2(arg0,method[,arg1[,arg2..]]) calls method of arg0 with reflection +Use this UDF to call Java methods by matching the argument signature + diff --git a/sql/hive/src/test/resources/golden/udf_reflect2-3-effc057c78c00b0af26a4ac0f5f116ca b/sql/hive/src/test/resources/golden/udf_reflect2-3-effc057c78c00b0af26a4ac0f5f116ca new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sql/hive/src/test/resources/golden/udf_reflect2-4-73d466e70e96e9e5f0cd373b37d4e1f4 b/sql/hive/src/test/resources/golden/udf_reflect2-4-73d466e70e96e9e5f0cd373b37d4e1f4 new file mode 100644 index 0000000000000..176ea0358d7ea --- /dev/null +++ b/sql/hive/src/test/resources/golden/udf_reflect2-4-73d466e70e96e9e5f0cd373b37d4e1f4 @@ -0,0 +1,5 @@ +238 -18 238 238 238 238.0 238.0 238 val_238 val_238_concat false true false false false val_238 -1 -1 VALUE_238 al_238 al_2 VAL_238 val_238 2013-02-15 19:41:20 113 1 5 19 41 20 1360986080000 +86 86 86 86 86 86.0 86.0 86 val_86 val_86_concat true true true true true val_86 -1 -1 VALUE_86 al_86 al_8 VAL_86 val_86 2013-02-15 19:41:20 113 1 5 19 41 20 1360986080000 +311 55 311 311 311 311.0 311.0 311 val_311 val_311_concat false true false false false val_311 5 6 VALUE_311 al_311 al_3 VAL_311 val_311 2013-02-15 19:41:20 113 1 5 19 41 20 1360986080000 +27 27 27 27 27 27.0 27.0 27 val_27 val_27_concat false true false false false val_27 -1 -1 VALUE_27 al_27 al_2 VAL_27 val_27 2013-02-15 19:41:20 113 1 5 19 41 20 1360986080000 +165 -91 165 165 165 165.0 165.0 165 val_165 val_165_concat false true false false false val_165 4 4 VALUE_165 al_165 al_1 VAL_165 val_165 2013-02-15 19:41:20 113 1 5 19 41 20 1360986080000 From de4836f8f12c36c1b350cef288a75b5e59155735 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 17 Feb 2015 13:23:45 -0800 Subject: [PATCH 226/357] [SPARK-5868][SQL] Fix python UDFs in HiveContext and checks in SQLContext Author: Michael Armbrust Closes #4657 from marmbrus/pythonUdfs and squashes the following commits: a7823a8 [Michael Armbrust] [SPARK-5868][SQL] Fix python UDFs in HiveContext and checks in SQLContext --- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 3 
++- .../main/scala/org/apache/spark/sql/execution/pythonUdfs.scala | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 31afa0eb59a8e..709b350144c75 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -113,6 +113,7 @@ class SQLContext(@transient val sparkContext: SparkContext) protected[sql] lazy val analyzer: Analyzer = new Analyzer(catalog, functionRegistry, caseSensitive = true) { override val extendedResolutionRules = + ExtractPythonUdfs :: sources.PreWriteCheck(catalog) :: sources.PreInsertCastAndRename :: Nil @@ -1059,7 +1060,7 @@ class SQLContext(@transient val sparkContext: SparkContext) @DeveloperApi protected[sql] class QueryExecution(val logical: LogicalPlan) { - lazy val analyzed: LogicalPlan = ExtractPythonUdfs(analyzer(logical)) + lazy val analyzed: LogicalPlan = analyzer(logical) lazy val withCachedData: LogicalPlan = cacheManager.useCachedData(analyzed) lazy val optimizedPlan: LogicalPlan = optimizer(withCachedData) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala index 3a2f8d75dac5e..69de4d168a372 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala @@ -205,6 +205,9 @@ case class EvaluatePython( extends logical.UnaryNode { def output = child.output :+ resultAttribute + + // References should not include the produced attribute. override def references = udf.references } /** From 445a755b884885b88c1778fd56a3151045b0b0ed Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 17 Feb 2015 13:36:43 -0800 Subject: [PATCH 227/357] [SPARK-4172] [PySpark] Progress API in Python This patch brings the pull-based progress API into Python, along with an example in Python.
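In outline, the new Python API reads as below; this is a hedged sketch assuming an active SparkContext sc with a job already running in the background, and the patch itself adds a fuller demo in examples/src/main/python/status_api_demo.py:

    status = sc.statusTracker()
    for job_id in status.getJobIdsForGroup():
        job = status.getJobInfo(job_id)
        for stage_id in job.stageIds:
            info = status.getStageInfo(stage_id)
            if info:
                print("stage %d: %d of %d tasks complete"
                      % (stage_id, info.numCompletedTasks, info.numTasks))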
Author: Davies Liu Closes #3027 from davies/progress_api and squashes the following commits: b1ba984 [Davies Liu] fix style d3b9253 [Davies Liu] add tests, mute the exception after stop 4297327 [Davies Liu] Merge branch 'master' of github.com:apache/spark into progress_api 969fa9d [Davies Liu] Merge branch 'master' of github.com:apache/spark into progress_api 25590c9 [Davies Liu] update with Java API 360de2d [Davies Liu] Merge branch 'master' of github.com:apache/spark into progress_api c0f1021 [Davies Liu] Merge branch 'master' of github.com:apache/spark into progress_api 023afb3 [Davies Liu] add Python API and example for progress API --- .../spark/scheduler/TaskResultGetter.scala | 40 ++++---- examples/src/main/python/status_api_demo.py | 67 +++++++++++++ python/pyspark/__init__.py | 15 +-- python/pyspark/context.py | 7 ++ python/pyspark/status.py | 96 +++++++++++++++++++ python/pyspark/tests.py | 31 ++++++ 6 files changed, 232 insertions(+), 24 deletions(-) create mode 100644 examples/src/main/python/status_api_demo.py create mode 100644 python/pyspark/status.py diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index 774f3d8cdb275..3938580aeea59 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -18,6 +18,7 @@ package org.apache.spark.scheduler import java.nio.ByteBuffer +import java.util.concurrent.RejectedExecutionException import scala.language.existentials import scala.util.control.NonFatal @@ -95,25 +96,30 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul def enqueueFailedTask(taskSetManager: TaskSetManager, tid: Long, taskState: TaskState, serializedData: ByteBuffer) { var reason : TaskEndReason = UnknownReason - getTaskResultExecutor.execute(new Runnable { - override def run(): Unit = Utils.logUncaughtExceptions { - try { - if (serializedData != null && serializedData.limit() > 0) { - reason = serializer.get().deserialize[TaskEndReason]( - serializedData, Utils.getSparkClassLoader) + try { + getTaskResultExecutor.execute(new Runnable { + override def run(): Unit = Utils.logUncaughtExceptions { + try { + if (serializedData != null && serializedData.limit() > 0) { + reason = serializer.get().deserialize[TaskEndReason]( + serializedData, Utils.getSparkClassLoader) + } + } catch { + case cnd: ClassNotFoundException => + // Log an error but keep going here -- the task failed, so not catastrophic + // if we can't deserialize the reason. + val loader = Utils.getContextOrSparkClassLoader + logError( + "Could not deserialize TaskEndReason: ClassNotFound with classloader " + loader) + case ex: Exception => {} } - } catch { - case cnd: ClassNotFoundException => - // Log an error but keep going here -- the task failed, so not catastrophic if we can't - // deserialize the reason. 
- val loader = Utils.getContextOrSparkClassLoader - logError( - "Could not deserialize TaskEndReason: ClassNotFound with classloader " + loader) - case ex: Exception => {} + scheduler.handleFailedTask(taskSetManager, tid, taskState, reason) } - scheduler.handleFailedTask(taskSetManager, tid, taskState, reason) - } - }) + }) + } catch { + case e: RejectedExecutionException if sparkEnv.isStopped => + // ignore it + } } def stop() { diff --git a/examples/src/main/python/status_api_demo.py b/examples/src/main/python/status_api_demo.py new file mode 100644 index 0000000000000..a33bdc475a06d --- /dev/null +++ b/examples/src/main/python/status_api_demo.py @@ -0,0 +1,67 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import time +import threading +import Queue + +from pyspark import SparkConf, SparkContext + + +def delayed(seconds): + def f(x): + time.sleep(seconds) + return x + return f + + +def call_in_background(f, *args): + result = Queue.Queue(1) + t = threading.Thread(target=lambda: result.put(f(*args))) + t.daemon = True + t.start() + return result + + +def main(): + conf = SparkConf().set("spark.ui.showConsoleProgress", "false") + sc = SparkContext(appName="PythonStatusAPIDemo", conf=conf) + + def run(): + rdd = sc.parallelize(range(10), 10).map(delayed(2)) + reduced = rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y) + return reduced.map(delayed(2)).collect() + + result = call_in_background(run) + status = sc.statusTracker() + while result.empty(): + ids = status.getJobIdsForGroup() + for id in ids: + job = status.getJobInfo(id) + print "Job", id, "status: ", job.status + for sid in job.stageIds: + info = status.getStageInfo(sid) + if info: + print "Stage %d: %d tasks total (%d active, %d complete)" % \ + (sid, info.numTasks, info.numActiveTasks, info.numCompletedTasks) + time.sleep(1) + + print "Job results are:", result.get() + sc.stop() + +if __name__ == "__main__": + main() diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index d3efcdf221d82..5f70ac6ed8fe6 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -22,17 +22,17 @@ - :class:`SparkContext`: Main entry point for Spark functionality. - - L{RDD} + - :class:`RDD`: A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. - - L{Broadcast} + - :class:`Broadcast`: A broadcast variable that gets reused across tasks. - - L{Accumulator} + - :class:`Accumulator`: An "add-only" shared variable that tasks can only add values to. - - L{SparkConf} + - :class:`SparkConf`: For configuring Spark. - - L{SparkFiles} + - :class:`SparkFiles`: Access files shipped with jobs. - - L{StorageLevel} + - :class:`StorageLevel`: Finer-grained cache persistence levels. 
""" @@ -45,6 +45,7 @@ from pyspark.accumulators import Accumulator, AccumulatorParam from pyspark.broadcast import Broadcast from pyspark.serializers import MarshalSerializer, PickleSerializer +from pyspark.status import * from pyspark.profiler import Profiler, BasicProfiler # for back compatibility @@ -53,5 +54,5 @@ __all__ = [ "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast", "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer", - "Profiler", "BasicProfiler", + "StatusTracker", "SparkJobInfo", "SparkStageInfo", "Profiler", "BasicProfiler", ] diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 40b3152b23843..6011caf9f1c5a 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -32,6 +32,7 @@ from pyspark.storagelevel import StorageLevel from pyspark.rdd import RDD from pyspark.traceback_utils import CallSite, first_spark_call +from pyspark.status import StatusTracker from pyspark.profiler import ProfilerCollector, BasicProfiler from py4j.java_collections import ListConverter @@ -810,6 +811,12 @@ def cancelAllJobs(self): """ self._jsc.sc().cancelAllJobs() + def statusTracker(self): + """ + Return :class:`StatusTracker` object + """ + return StatusTracker(self._jsc.statusTracker()) + def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False): """ Executes the given partitionFunc on the specified set of partitions, diff --git a/python/pyspark/status.py b/python/pyspark/status.py new file mode 100644 index 0000000000000..a6fa7dd3144d4 --- /dev/null +++ b/python/pyspark/status.py @@ -0,0 +1,96 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from collections import namedtuple + +__all__ = ["SparkJobInfo", "SparkStageInfo", "StatusTracker"] + + +class SparkJobInfo(namedtuple("SparkJobInfo", "jobId stageIds status")): + """ + Exposes information about Spark Jobs. + """ + + +class SparkStageInfo(namedtuple("SparkStageInfo", + "stageId currentAttemptId name numTasks numActiveTasks " + "numCompletedTasks numFailedTasks")): + """ + Exposes information about Spark Stages. + """ + + +class StatusTracker(object): + """ + Low-level status reporting APIs for monitoring job and stage progress. + + These APIs intentionally provide very weak consistency semantics; + consumers of these APIs should be prepared to handle empty / missing + information. For example, a job's stage ids may be known but the status + API may not have any information about the details of those stages, so + `getStageInfo` could potentially return `None` for a valid stage id. + + To limit memory usage, these APIs only provide information on recent + jobs / stages. These APIs will provide information for the last + `spark.ui.retainedStages` stages and `spark.ui.retainedJobs` jobs. 
+ """ + def __init__(self, jtracker): + self._jtracker = jtracker + + def getJobIdsForGroup(self, jobGroup=None): + """ + Return a list of all known jobs in a particular job group. If + `jobGroup` is None, then returns all known jobs that are not + associated with a job group. + + The returned list may contain running, failed, and completed jobs, + and may vary across invocations of this method. This method does + not guarantee the order of the elements in its result. + """ + return list(self._jtracker.getJobIdsForGroup(jobGroup)) + + def getActiveStageIds(self): + """ + Returns an array containing the ids of all active stages. + """ + return sorted(list(self._jtracker.getActiveStageIds())) + + def getActiveJobsIds(self): + """ + Returns an array containing the ids of all active jobs. + """ + return sorted((list(self._jtracker.getActiveJobIds()))) + + def getJobInfo(self, jobId): + """ + Returns a :class:`SparkJobInfo` object, or None if the job info + could not be found or was garbage collected. + """ + job = self._jtracker.getJobInfo(jobId) + if job is not None: + return SparkJobInfo(jobId, job.stageIds(), str(job.status())) + + def getStageInfo(self, stageId): + """ + Returns a :class:`SparkStageInfo` object, or None if the stage + info could not be found or was garbage collected. + """ + stage = self._jtracker.getStageInfo(stageId) + if stage is not None: + # TODO: fetch them in batch for better performance + attrs = [getattr(stage, f)() for f in SparkStageInfo._fields[1:]] + return SparkStageInfo(stageId, *attrs) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index b5e28c498040b..d6afc1cdaa4dc 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -1550,6 +1550,37 @@ def test_with_stop(self): sc.stop() self.assertEqual(SparkContext._active_spark_context, None) + def test_progress_api(self): + with SparkContext() as sc: + sc.setJobGroup('test_progress_api', '', True) + + rdd = sc.parallelize(range(10)).map(lambda x: time.sleep(100)) + t = threading.Thread(target=rdd.collect) + t.daemon = True + t.start() + # wait for scheduler to start + time.sleep(1) + + tracker = sc.statusTracker() + jobIds = tracker.getJobIdsForGroup('test_progress_api') + self.assertEqual(1, len(jobIds)) + job = tracker.getJobInfo(jobIds[0]) + self.assertEqual(1, len(job.stageIds)) + stage = tracker.getStageInfo(job.stageIds[0]) + self.assertEqual(rdd.getNumPartitions(), stage.numTasks) + + sc.cancelAllJobs() + t.join() + # wait for event listener to update the status + time.sleep(1) + + job = tracker.getJobInfo(jobIds[0]) + self.assertEqual('FAILED', job.status) + self.assertEqual([], tracker.getActiveJobsIds()) + self.assertEqual([], tracker.getActiveStageIds()) + + sc.stop() + @unittest.skipIf(not _have_scipy, "SciPy not installed") class SciPyTests(PySparkTestCase): From 3df85dccbc8fd1ba19bbcdb8d359c073b1494d98 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 17 Feb 2015 13:48:38 -0800 Subject: [PATCH 228/357] [SPARK-5871] output explain in Python Author: Davies Liu Closes #4658 from davies/explain and squashes the following commits: db87ea2 [Davies Liu] output explain in Python --- python/pyspark/sql/dataframe.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 841724095f693..388033d385e13 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -244,8 +244,25 @@ def explain(self, extended=False): debugging purpose. 
If extended is False, only prints the physical plan. - """ - self._jdf.explain(extended) + + >>> df.explain() + PhysicalRDD [age#0,name#1], MapPartitionsRDD[...] at mapPartitions at SQLContext.scala:... + + >>> df.explain(True) + == Parsed Logical Plan == + ... + == Analyzed Logical Plan == + ... + == Optimized Logical Plan == + ... + == Physical Plan == + ... + == RDD == + """ + if extended: + print self._jdf.queryExecution().toString() + else: + print self._jdf.queryExecution().executedPlan().toString() def isLocal(self): """ @@ -1034,7 +1051,7 @@ def _test(): Row(name='Bob', age=5, height=85)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, - optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['sc'].stop() if failure_count: exit(-1) From 4d4cc760fa9687ce563320094557ef9144488676 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 17 Feb 2015 15:44:37 -0800 Subject: [PATCH 229/357] [SPARK-5872] [SQL] create a sqlCtx in pyspark shell The sqlCtx will be HiveContext if hive is built in assembly jar, or SQLContext if not. It also skip the Hive tests in pyspark.sql.tests if no hive is available. Author: Davies Liu Closes #4659 from davies/sqlctx and squashes the following commits: 0e6629a [Davies Liu] sqlCtx in pyspark --- python/pyspark/shell.py | 13 ++++++++++++- python/pyspark/sql/tests.py | 12 ++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index 4cf4b89ccfaa7..1a02fece9c5a5 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -31,8 +31,12 @@ import atexit import os import platform + +import py4j + import pyspark from pyspark.context import SparkContext +from pyspark.sql import SQLContext, HiveContext from pyspark.storagelevel import StorageLevel # this is the deprecated equivalent of ADD_JARS @@ -46,6 +50,13 @@ sc = SparkContext(appName="PySparkShell", pyFiles=add_files) atexit.register(lambda: sc.stop()) +try: + # Try to access HiveConf, it will raise exception if Hive is not added + sc._jvm.org.apache.hadoop.hive.conf.HiveConf() + sqlCtx = HiveContext(sc) +except py4j.protocol.Py4JError: + sqlCtx = SQLContext(sc) + print("""Welcome to ____ __ / __/__ ___ _____/ /__ @@ -57,7 +68,7 @@ platform.python_version(), platform.python_build()[0], platform.python_build()[1])) -print("SparkContext available as sc.") +print("SparkContext available as sc, %s available as sqlCtx." 
% sqlCtx.__class__.__name__) if add_files is not None: print("Warning: ADD_FILES environment variable is deprecated, use --py-files argument instead") diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index aa80bca34655d..52f7e65d9ca78 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -25,6 +25,8 @@ import shutil import tempfile +import py4j + if sys.version_info[:2] <= (2, 6): try: import unittest2 as unittest @@ -329,9 +331,12 @@ class HiveContextSQLTests(ReusedPySparkTestCase): def setUpClass(cls): ReusedPySparkTestCase.setUpClass() cls.tempdir = tempfile.NamedTemporaryFile(delete=False) + try: + cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf() + except py4j.protocol.Py4JError: + cls.sqlCtx = None + return os.unlink(cls.tempdir.name) - print "type", type(cls.sc) - print "type", type(cls.sc._jsc) _scala_HiveContext =\ cls.sc._jvm.org.apache.spark.sql.hive.test.TestHiveContext(cls.sc._jsc.sc()) cls.sqlCtx = HiveContext(cls.sc, _scala_HiveContext) @@ -344,6 +349,9 @@ def tearDownClass(cls): shutil.rmtree(cls.tempdir.name, ignore_errors=True) def test_save_and_load_table(self): + if self.sqlCtx is None: + return # no hive available, skipped + df = self.df tmpPath = tempfile.mkdtemp() shutil.rmtree(tmpPath) From 117121a4ecaadda156a82255333670775e7727db Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 17 Feb 2015 15:47:59 -0800 Subject: [PATCH 230/357] [SPARK-5852][SQL]Fail to convert a newly created empty metastore parquet table to a data source parquet table. The problem is that after we create an empty hive metastore parquet table (e.g. `CREATE TABLE test (a int) STORED AS PARQUET`), Hive will create an empty dir for us, which cause our data source `ParquetRelation2` fail to get the schema of the table. See JIRA for the case to reproduce the bug and the exception. This PR is based on #4562 from chenghao-intel. JIRA: https://issues.apache.org/jira/browse/SPARK-5852 Author: Yin Huai Author: Cheng Hao Closes #4655 from yhuai/CTASParquet and squashes the following commits: b8b3450 [Yin Huai] Update tests. 2ac94f7 [Yin Huai] Update tests. 3db3d20 [Yin Huai] Minor update. d7e2308 [Yin Huai] Revert changes in HiveMetastoreCatalog.scala. 36978d1 [Cheng Hao] Update the code as feedback a04930b [Cheng Hao] fix bug of scan an empty parquet based table 442ffe0 [Cheng Hao] passdown the schema for Parquet File in HiveContext --- .../apache/spark/sql/parquet/newParquet.scala | 18 ++- .../sql/hive/MetastoreDataSourcesSuite.scala | 38 ++++++ .../spark/sql/parquet/parquetSuites.scala | 114 +++++++++++++++++- 3 files changed, 164 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 95bea9201163d..16b771344bfcd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -287,7 +287,16 @@ private[sql] case class ParquetRelation2( } } - parquetSchema = maybeSchema.getOrElse(readSchema()) + // To get the schema. We first try to get the schema defined in maybeSchema. + // If maybeSchema is not defined, we will try to get the schema from existing parquet data + // (through readSchema). If data does not exist, we will try to get the schema defined in + // maybeMetastoreSchema (defined in the options of the data source). + // Finally, if we still could not get the schema. We throw an error. 
+ parquetSchema = + maybeSchema + .orElse(readSchema()) + .orElse(maybeMetastoreSchema) + .getOrElse(sys.error("Failed to get the schema.")) partitionKeysIncludedInParquetSchema = isPartitioned && @@ -308,7 +317,7 @@ private[sql] case class ParquetRelation2( } } - private def readSchema(): StructType = { + private def readSchema(): Option[StructType] = { // Sees which file(s) we need to touch in order to figure out the schema. val filesToTouch = // Always tries the summary files first if users don't require a merged schema. In this case, @@ -611,7 +620,8 @@ private[sql] object ParquetRelation2 { // internally. private[sql] val METASTORE_SCHEMA = "metastoreSchema" - private[parquet] def readSchema(footers: Seq[Footer], sqlContext: SQLContext): StructType = { + private[parquet] def readSchema( + footers: Seq[Footer], sqlContext: SQLContext): Option[StructType] = { footers.map { footer => val metadata = footer.getParquetMetadata.getFileMetaData val parquetSchema = metadata.getSchema @@ -630,7 +640,7 @@ private[sql] object ParquetRelation2 { sqlContext.conf.isParquetBinaryAsString, sqlContext.conf.isParquetINT96AsTimestamp)) } - }.reduce { (left, right) => + }.reduceOption { (left, right) => try left.merge(right) catch { case e: Throwable => throw new SparkException(s"Failed to merge incompatible schemas $left and $right", e) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 485d5c95bfc44..c30090fabbc8d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive import java.io.File + import org.scalatest.BeforeAndAfterEach import org.apache.commons.io.FileUtils @@ -30,6 +31,8 @@ import org.apache.spark.util.Utils import org.apache.spark.sql.types._ import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.parquet.ParquetRelation2 +import org.apache.spark.sql.sources.LogicalRelation /** * Tests for persisting tables created though the data sources API into the metastore. @@ -553,4 +556,39 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { sql("DROP TABLE savedJsonTable") conf.setConf(SQLConf.DEFAULT_DATA_SOURCE_NAME, originalDefaultSource) } + + if (HiveShim.version == "0.13.1") { + test("scan a parquet table created through a CTAS statement") { + val originalConvertMetastore = getConf("spark.sql.hive.convertMetastoreParquet", "true") + val originalUseDataSource = getConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") + setConf("spark.sql.hive.convertMetastoreParquet", "true") + setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") + + val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) + jsonRDD(rdd).registerTempTable("jt") + sql( + """ + |create table test_parquet_ctas STORED AS parquET + |AS select tmp.a from jt tmp where tmp.a < 5 + """.stripMargin) + + checkAnswer( + sql(s"SELECT a FROM test_parquet_ctas WHERE a > 2 "), + Row(3) :: Row(4) :: Nil + ) + + table("test_parquet_ctas").queryExecution.analyzed match { + case LogicalRelation(p: ParquetRelation2) => // OK + case _ => + fail( + s"test_parquet_ctas should be converted to ${classOf[ParquetRelation2].getCanonicalName}") + } + + // Clenup and reset confs. 
+ sql("DROP TABLE IF EXISTS jt") + sql("DROP TABLE IF EXISTS test_parquet_ctas") + setConf("spark.sql.hive.convertMetastoreParquet", originalConvertMetastore) + setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalUseDataSource) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index 2acf1a7767c19..653f4b47367c4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -20,15 +20,15 @@ package org.apache.spark.sql.parquet import java.io.File -import org.apache.spark.sql.catalyst.expressions.Row import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.{SQLConf, QueryTest} +import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.execution.PhysicalRDD import org.apache.spark.sql.hive.execution.HiveTableScan import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ - +import org.apache.spark.sql.sources.LogicalRelation // The data where the partitioning key exists only in the directory structure. case class ParquetData(intField: Int, stringField: String) @@ -121,13 +121,123 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase { override def beforeAll(): Unit = { super.beforeAll() + + val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) + jsonRDD(rdd).registerTempTable("jt") + + sql( + """ + |create table test_parquet + |( + | intField INT, + | stringField STRING + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin) + conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") } override def afterAll(): Unit = { super.afterAll() + sql("DROP TABLE IF EXISTS jt") + sql("DROP TABLE IF EXISTS test_parquet") + setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) } + + test("scan an empty parquet table") { + checkAnswer(sql("SELECT count(*) FROM test_parquet"), Row(0)) + } + + test("scan an empty parquet table with upper case") { + checkAnswer(sql("SELECT count(INTFIELD) FROM TEST_parquet"), Row(0)) + } + + test("insert into an empty parquet table") { + sql( + """ + |create table test_insert_parquet + |( + | intField INT, + | stringField STRING + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin) + + // Insert into am empty table. + sql("insert into table test_insert_parquet select a, b from jt where jt.a > 5") + checkAnswer( + sql(s"SELECT intField, stringField FROM test_insert_parquet WHERE intField < 8"), + Row(6, "str6") :: Row(7, "str7") :: Nil + ) + // Insert overwrite. + sql("insert overwrite table test_insert_parquet select a, b from jt where jt.a < 5") + checkAnswer( + sql(s"SELECT intField, stringField FROM test_insert_parquet WHERE intField > 2"), + Row(3, "str3") :: Row(4, "str4") :: Nil + ) + sql("DROP TABLE IF EXISTS test_insert_parquet") + + // Create it again. 
+ sql( + """ + |create table test_insert_parquet + |( + | intField INT, + | stringField STRING + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin) + // Insert overwrite an empty table. + sql("insert overwrite table test_insert_parquet select a, b from jt where jt.a < 5") + checkAnswer( + sql(s"SELECT intField, stringField FROM test_insert_parquet WHERE intField > 2"), + Row(3, "str3") :: Row(4, "str4") :: Nil + ) + // Insert into the table. + sql("insert into table test_insert_parquet select a, b from jt") + checkAnswer( + sql(s"SELECT intField, stringField FROM test_insert_parquet"), + (1 to 10).map(i => Row(i, s"str$i")) ++ (1 to 4).map(i => Row(i, s"str$i")) + ) + sql("DROP TABLE IF EXISTS test_insert_parquet") + } + + test("scan a parquet table created through a CTAS statement") { + sql( + """ + |create table test_parquet_ctas ROW FORMAT + |SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + |AS select * from jt + """.stripMargin) + + checkAnswer( + sql(s"SELECT a, b FROM test_parquet_ctas WHERE a = 1"), + Seq(Row(1, "str1")) + ) + + table("test_parquet_ctas").queryExecution.analyzed match { + case LogicalRelation(p: ParquetRelation2) => // OK + case _ => + fail( + s"test_parquet_ctas should be converted to ${classOf[ParquetRelation2].getCanonicalName}") + } + + sql("DROP TABLE IF EXISTS test_parquet_ctas") + } } class ParquetDataSourceOffMetastoreSuite extends ParquetMetastoreSuiteBase { From c3d2b90bde2e11823909605d518167548df66bd8 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 17 Feb 2015 16:54:57 -0800 Subject: [PATCH 231/357] [SPARK-5785] [PySpark] narrow dependency for cogroup/join in PySpark Currently, PySpark does not support narrow dependency during cogroup/join when the two RDDs have the partitioner, another unnecessary shuffle stage will come in. The Python implementation of cogroup/join is different than Scala one, it depends on union() and partitionBy(). This patch will try to use PartitionerAwareUnionRDD() in union(), when all the RDDs have the same partitioner. It also fix `reservePartitioner` in all the map() or mapPartitions(), then partitionBy() can skip the unnecessary shuffle stage. 
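A minimal PySpark sketch of the effect described above, assuming a local SparkContext; the expected partition and stage counts follow the `test_narrow_dependency_in_join` test added further down in this patch:

    from pyspark import SparkContext

    sc = SparkContext("local[2]", "narrow-join-demo")

    rdd = sc.parallelize(range(10)).map(lambda x: (x, x))
    parted = rdd.partitionBy(2)

    # Both sides carry the same Partitioner, so union() keeps 2 partitions
    # instead of concatenating them into 4.
    print parted.union(parted).getNumPartitions()    # 2

    # The self-join reuses the existing partitioning, so the job only needs
    # the initial partitionBy() shuffle (2 stages in total after this patch).
    print sorted(parted.join(parted).collect())[:3]  # [(0, (0, 0)), (1, (1, 1)), (2, (2, 2))]

    sc.stop()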
Author: Davies Liu Closes #4629 from davies/narrow and squashes the following commits: dffe34e [Davies Liu] improve test, check number of stages for join/cogroup 1ed3ba2 [Davies Liu] Merge branch 'master' of github.com:apache/spark into narrow 4d29932 [Davies Liu] address comment cc28d97 [Davies Liu] add unit tests 940245e [Davies Liu] address comments ff5a0a6 [Davies Liu] skip the partitionBy() on Python side eb26c62 [Davies Liu] narrow dependency in PySpark --- .../scala/org/apache/spark/SparkContext.scala | 11 ++++- .../apache/spark/api/python/PythonRDD.scala | 10 ++++ .../main/scala/org/apache/spark/rdd/RDD.scala | 8 ++- python/pyspark/join.py | 8 +-- python/pyspark/rdd.py | 49 +++++++++++++------ python/pyspark/streaming/dstream.py | 2 +- python/pyspark/tests.py | 38 +++++++++++++- 7 files changed, 101 insertions(+), 25 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index fd8fac6df0d0e..d59b466830fdc 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -961,11 +961,18 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli } /** Build the union of a list of RDDs. */ - def union[T: ClassTag](rdds: Seq[RDD[T]]): RDD[T] = new UnionRDD(this, rdds) + def union[T: ClassTag](rdds: Seq[RDD[T]]): RDD[T] = { + val partitioners = rdds.flatMap(_.partitioner).toSet + if (partitioners.size == 1) { + new PartitionerAwareUnionRDD(this, rdds) + } else { + new UnionRDD(this, rdds) + } + } /** Build the union of a list of RDDs passed as variable-length arguments. */ def union[T: ClassTag](first: RDD[T], rest: RDD[T]*): RDD[T] = - new UnionRDD(this, Seq(first) ++ rest) + union(Seq(first) ++ rest) /** Get an RDD that has no partitions or elements. */ def emptyRDD[T: ClassTag] = new EmptyRDD[T](this) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 252721192904f..dcb6e6313a1d2 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -303,6 +303,7 @@ private class PythonException(msg: String, cause: Exception) extends RuntimeExce private class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte])](prev) { override def getPartitions = prev.partitions + override val partitioner = prev.partitioner override def compute(split: Partition, context: TaskContext) = prev.iterator(split, context).grouped(2).map { case Seq(a, b) => (Utils.deserializeLongValue(a), b) @@ -329,6 +330,15 @@ private[spark] object PythonRDD extends Logging { } } + /** + * Return an RDD of values from an RDD of (Long, Array[Byte]), with preservePartitions=true + * + * This is useful for PySpark to have the partitioner after partitionBy() + */ + def valueOfPair(pair: JavaPairRDD[Long, Array[Byte]]): JavaRDD[Array[Byte]] = { + pair.rdd.mapPartitions(it => it.map(_._2), true) + } + /** * Adapter for calling SparkContext#runJob from Python. * diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index fe55a5124f3b6..3ab9e54f0ec56 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -462,7 +462,13 @@ abstract class RDD[T: ClassTag]( * Return the union of this RDD and another one. 
Any identical elements will appear multiple * times (use `.distinct()` to eliminate them). */ - def union(other: RDD[T]): RDD[T] = new UnionRDD(sc, Array(this, other)) + def union(other: RDD[T]): RDD[T] = { + if (partitioner.isDefined && other.partitioner == partitioner) { + new PartitionerAwareUnionRDD(sc, Array(this, other)) + } else { + new UnionRDD(sc, Array(this, other)) + } + } /** * Return the union of this RDD and another one. Any identical elements will appear multiple diff --git a/python/pyspark/join.py b/python/pyspark/join.py index b4a844713745a..efc1ef9396412 100644 --- a/python/pyspark/join.py +++ b/python/pyspark/join.py @@ -35,8 +35,8 @@ def _do_python_join(rdd, other, numPartitions, dispatch): - vs = rdd.map(lambda (k, v): (k, (1, v))) - ws = other.map(lambda (k, v): (k, (2, v))) + vs = rdd.mapValues(lambda v: (1, v)) + ws = other.mapValues(lambda v: (2, v)) return vs.union(ws).groupByKey(numPartitions).flatMapValues(lambda x: dispatch(x.__iter__())) @@ -98,8 +98,8 @@ def dispatch(seq): def python_cogroup(rdds, numPartitions): def make_mapper(i): - return lambda (k, v): (k, (i, v)) - vrdds = [rdd.map(make_mapper(i)) for i, rdd in enumerate(rdds)] + return lambda v: (i, v) + vrdds = [rdd.mapValues(make_mapper(i)) for i, rdd in enumerate(rdds)] union_vrdds = reduce(lambda acc, other: acc.union(other), vrdds) rdd_len = len(vrdds) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index bd4f16e058045..ba2347ae76844 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -111,6 +111,19 @@ def _parse_memory(s): return int(float(s[:-1]) * units[s[-1].lower()]) +class Partitioner(object): + def __init__(self, numPartitions, partitionFunc): + self.numPartitions = numPartitions + self.partitionFunc = partitionFunc + + def __eq__(self, other): + return (isinstance(other, Partitioner) and self.numPartitions == other.numPartitions + and self.partitionFunc == other.partitionFunc) + + def __call__(self, k): + return self.partitionFunc(k) % self.numPartitions + + class RDD(object): """ @@ -126,7 +139,7 @@ def __init__(self, jrdd, ctx, jrdd_deserializer=AutoBatchedSerializer(PickleSeri self.ctx = ctx self._jrdd_deserializer = jrdd_deserializer self._id = jrdd.id() - self._partitionFunc = None + self.partitioner = None def _pickled(self): return self._reserialize(AutoBatchedSerializer(PickleSerializer())) @@ -450,14 +463,17 @@ def union(self, other): if self._jrdd_deserializer == other._jrdd_deserializer: rdd = RDD(self._jrdd.union(other._jrdd), self.ctx, self._jrdd_deserializer) - return rdd else: # These RDDs contain data in different serialized formats, so we # must normalize them to the default serializer. self_copy = self._reserialize() other_copy = other._reserialize() - return RDD(self_copy._jrdd.union(other_copy._jrdd), self.ctx, - self.ctx.serializer) + rdd = RDD(self_copy._jrdd.union(other_copy._jrdd), self.ctx, + self.ctx.serializer) + if (self.partitioner == other.partitioner and + self.getNumPartitions() == rdd.getNumPartitions()): + rdd.partitioner = self.partitioner + return rdd def intersection(self, other): """ @@ -1588,6 +1604,9 @@ def partitionBy(self, numPartitions, partitionFunc=portable_hash): """ if numPartitions is None: numPartitions = self._defaultReducePartitions() + partitioner = Partitioner(numPartitions, partitionFunc) + if self.partitioner == partitioner: + return self # Transferring O(n) objects to Java is too expensive. 
# Instead, we'll form the hash buckets in Python, @@ -1632,18 +1651,16 @@ def add_shuffle_key(split, iterator): yield pack_long(split) yield outputSerializer.dumps(items) - keyed = self.mapPartitionsWithIndex(add_shuffle_key) + keyed = self.mapPartitionsWithIndex(add_shuffle_key, preservesPartitioning=True) keyed._bypass_serializer = True with SCCallSiteSync(self.context) as css: pairRDD = self.ctx._jvm.PairwiseRDD( keyed._jrdd.rdd()).asJavaPairRDD() - partitioner = self.ctx._jvm.PythonPartitioner(numPartitions, - id(partitionFunc)) - jrdd = pairRDD.partitionBy(partitioner).values() + jpartitioner = self.ctx._jvm.PythonPartitioner(numPartitions, + id(partitionFunc)) + jrdd = self.ctx._jvm.PythonRDD.valueOfPair(pairRDD.partitionBy(jpartitioner)) rdd = RDD(jrdd, self.ctx, BatchedSerializer(outputSerializer)) - # This is required so that id(partitionFunc) remains unique, - # even if partitionFunc is a lambda: - rdd._partitionFunc = partitionFunc + rdd.partitioner = partitioner return rdd # TODO: add control over map-side aggregation @@ -1689,7 +1706,7 @@ def combineLocally(iterator): merger.mergeValues(iterator) return merger.iteritems() - locally_combined = self.mapPartitions(combineLocally) + locally_combined = self.mapPartitions(combineLocally, preservesPartitioning=True) shuffled = locally_combined.partitionBy(numPartitions) def _mergeCombiners(iterator): @@ -1698,7 +1715,7 @@ def _mergeCombiners(iterator): merger.mergeCombiners(iterator) return merger.iteritems() - return shuffled.mapPartitions(_mergeCombiners, True) + return shuffled.mapPartitions(_mergeCombiners, preservesPartitioning=True) def aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None): """ @@ -2077,8 +2094,8 @@ def lookup(self, key): """ values = self.filter(lambda (k, v): k == key).values() - if self._partitionFunc is not None: - return self.ctx.runJob(values, lambda x: x, [self._partitionFunc(key)], False) + if self.partitioner is not None: + return self.ctx.runJob(values, lambda x: x, [self.partitioner(key)], False) return values.collect() @@ -2243,7 +2260,7 @@ def pipeline_func(split, iterator): self._id = None self._jrdd_deserializer = self.ctx.serializer self._bypass_serializer = False - self._partitionFunc = prev._partitionFunc if self.preservesPartitioning else None + self.partitioner = prev.partitioner if self.preservesPartitioning else None self._broadcast = None def __del__(self): diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 2fe39392ff081..3fa42444239f7 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -578,7 +578,7 @@ def reduceFunc(t, a, b): if a is None: g = b.groupByKey(numPartitions).mapValues(lambda vs: (list(vs), None)) else: - g = a.cogroup(b, numPartitions) + g = a.cogroup(b.partitionBy(numPartitions), numPartitions) g = g.mapValues(lambda (va, vb): (list(vb), list(va)[0] if len(va) else None)) state = g.mapValues(lambda (vs, s): updateFunc(vs, s)) return state.filter(lambda (k, v): v is not None) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index d6afc1cdaa4dc..f64e25c60740a 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -727,7 +727,6 @@ def test_multiple_python_java_RDD_conversions(self): (u'1', {u'director': u'David Lean'}), (u'2', {u'director': u'Andrew Dominik'}) ] - from pyspark.rdd import RDD data_rdd = self.sc.parallelize(data) data_java_rdd = data_rdd._to_java_object_rdd() data_python_rdd = self.sc._jvm.SerDe.javaToPython(data_java_rdd) @@ -740,6 
+739,43 @@ def test_multiple_python_java_RDD_conversions(self): converted_rdd = RDD(data_python_rdd, self.sc) self.assertEqual(2, converted_rdd.count()) + def test_narrow_dependency_in_join(self): + rdd = self.sc.parallelize(range(10)).map(lambda x: (x, x)) + parted = rdd.partitionBy(2) + self.assertEqual(2, parted.union(parted).getNumPartitions()) + self.assertEqual(rdd.getNumPartitions() + 2, parted.union(rdd).getNumPartitions()) + self.assertEqual(rdd.getNumPartitions() + 2, rdd.union(parted).getNumPartitions()) + + self.sc.setJobGroup("test1", "test", True) + tracker = self.sc.statusTracker() + + d = sorted(parted.join(parted).collect()) + self.assertEqual(10, len(d)) + self.assertEqual((0, (0, 0)), d[0]) + jobId = tracker.getJobIdsForGroup("test1")[0] + self.assertEqual(2, len(tracker.getJobInfo(jobId).stageIds)) + + self.sc.setJobGroup("test2", "test", True) + d = sorted(parted.join(rdd).collect()) + self.assertEqual(10, len(d)) + self.assertEqual((0, (0, 0)), d[0]) + jobId = tracker.getJobIdsForGroup("test2")[0] + self.assertEqual(3, len(tracker.getJobInfo(jobId).stageIds)) + + self.sc.setJobGroup("test3", "test", True) + d = sorted(parted.cogroup(parted).collect()) + self.assertEqual(10, len(d)) + self.assertEqual([[0], [0]], map(list, d[0][1])) + jobId = tracker.getJobIdsForGroup("test3")[0] + self.assertEqual(2, len(tracker.getJobInfo(jobId).stageIds)) + + self.sc.setJobGroup("test4", "test", True) + d = sorted(parted.cogroup(rdd).collect()) + self.assertEqual(10, len(d)) + self.assertEqual([[0], [0]], map(list, d[0][1])) + jobId = tracker.getJobIdsForGroup("test4")[0] + self.assertEqual(3, len(tracker.getJobInfo(jobId).stageIds)) + class ProfilerTests(PySparkTestCase): From ae6cfb3acdbc2721d25793698a4a440f0519dbec Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Tue, 17 Feb 2015 17:15:43 -0800 Subject: [PATCH 232/357] [SPARK-5811] Added documentation for maven coordinates and added Spark Packages support Documentation for maven coordinates + Spark Package support. Added pyspark tests for `--packages` Author: Burak Yavuz Author: Davies Liu Closes #4662 from brkyvz/SPARK-5811 and squashes the following commits: 56ccccd [Burak Yavuz] fixed broken test 64cb8ee [Burak Yavuz] passed pep8 on local c07b81e [Burak Yavuz] fixed pep8 a8bd6b7 [Burak Yavuz] submit PR 4ef4046 [Burak Yavuz] ready for PR 8fb02e5 [Burak Yavuz] merged master 25c9b9f [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into python-jar 560d13b [Burak Yavuz] before PR 17d3f76 [Davies Liu] support .jar as python package a3eb717 [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into SPARK-5811 c60156d [Burak Yavuz] [SPARK-5811] Added documentation for maven coordinates --- .../org/apache/spark/deploy/SparkSubmit.scala | 52 ++++++++++---- .../spark/deploy/SparkSubmitUtilsSuite.scala | 13 ++-- docs/programming-guide.md | 19 ++++- docs/submitting-applications.md | 5 ++ python/pyspark/tests.py | 69 +++++++++++++++++-- 5 files changed, 131 insertions(+), 27 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 012a89a31b046..4c4110812e0a1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -252,6 +252,26 @@ object SparkSubmit { val isYarnCluster = clusterManager == YARN && deployMode == CLUSTER + // Resolve maven dependencies if there are any and add classpath to jars. 
Add them to py-files + // too for packages that include Python code + val resolvedMavenCoordinates = + SparkSubmitUtils.resolveMavenCoordinates( + args.packages, Option(args.repositories), Option(args.ivyRepoPath)) + if (!resolvedMavenCoordinates.trim.isEmpty) { + if (args.jars == null || args.jars.trim.isEmpty) { + args.jars = resolvedMavenCoordinates + } else { + args.jars += s",$resolvedMavenCoordinates" + } + if (args.isPython) { + if (args.pyFiles == null || args.pyFiles.trim.isEmpty) { + args.pyFiles = resolvedMavenCoordinates + } else { + args.pyFiles += s",$resolvedMavenCoordinates" + } + } + } + // Require all python files to be local, so we can add them to the PYTHONPATH // In YARN cluster mode, python files are distributed as regular files, which can be non-local if (args.isPython && !isYarnCluster) { @@ -307,18 +327,6 @@ object SparkSubmit { // Special flag to avoid deprecation warnings at the client sysProps("SPARK_SUBMIT") = "true" - // Resolve maven dependencies if there are any and add classpath to jars - val resolvedMavenCoordinates = - SparkSubmitUtils.resolveMavenCoordinates( - args.packages, Option(args.repositories), Option(args.ivyRepoPath)) - if (!resolvedMavenCoordinates.trim.isEmpty) { - if (args.jars == null || args.jars.trim.isEmpty) { - args.jars = resolvedMavenCoordinates - } else { - args.jars += s",$resolvedMavenCoordinates" - } - } - // A list of rules to map each argument to system properties or command-line options in // each deploy mode; we iterate through these below val options = List[OptionAssigner]( @@ -646,13 +654,15 @@ private[spark] object SparkSubmitUtils { private[spark] case class MavenCoordinate(groupId: String, artifactId: String, version: String) /** - * Extracts maven coordinates from a comma-delimited string + * Extracts maven coordinates from a comma-delimited string. Coordinates should be provided + * in the format `groupId:artifactId:version` or `groupId/artifactId:version`. The latter provides + * simplicity for Spark Package users. * @param coordinates Comma-delimited string of maven coordinates * @return Sequence of Maven coordinates */ private[spark] def extractMavenCoordinates(coordinates: String): Seq[MavenCoordinate] = { coordinates.split(",").map { p => - val splits = p.split(":") + val splits = p.replace("/", ":").split(":") require(splits.length == 3, s"Provided Maven Coordinates must be in the form " + s"'groupId:artifactId:version'. 
The coordinate provided is: $p") require(splits(0) != null && splits(0).trim.nonEmpty, s"The groupId cannot be null or " + @@ -682,6 +692,13 @@ private[spark] object SparkSubmitUtils { br.setName("central") cr.add(br) + val sp: IBiblioResolver = new IBiblioResolver + sp.setM2compatible(true) + sp.setUsepoms(true) + sp.setRoot("http://dl.bintray.com/spark-packages/maven") + sp.setName("spark-packages") + cr.add(sp) + val repositoryList = remoteRepos.getOrElse("") // add any other remote repositories other than maven central if (repositoryList.trim.nonEmpty) { @@ -794,14 +811,19 @@ private[spark] object SparkSubmitUtils { val md = getModuleDescriptor md.setDefaultConf(ivyConfName) - // Add an exclusion rule for Spark + // Add an exclusion rule for Spark and Scala Library val sparkArtifacts = new ArtifactId(new ModuleId("org.apache.spark", "*"), "*", "*", "*") val sparkDependencyExcludeRule = new DefaultExcludeRule(sparkArtifacts, ivySettings.getMatcher("glob"), null) sparkDependencyExcludeRule.addConfiguration(ivyConfName) + val scalaArtifacts = new ArtifactId(new ModuleId("*", "scala-library"), "*", "*", "*") + val scalaDependencyExcludeRule = + new DefaultExcludeRule(scalaArtifacts, ivySettings.getMatcher("glob"), null) + scalaDependencyExcludeRule.addConfiguration(ivyConfName) // Exclude any Spark dependencies, and add all supplied maven artifacts as dependencies md.addExcludeRule(sparkDependencyExcludeRule) + md.addExcludeRule(scalaDependencyExcludeRule) addDependenciesToIvy(md, artifacts, ivyConfName) // resolve dependencies diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala index 53665350013cd..ad62b35f624f6 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala @@ -57,20 +57,23 @@ class SparkSubmitUtilsSuite extends FunSuite with BeforeAndAfterAll { test("create repo resolvers") { val resolver1 = SparkSubmitUtils.createRepoResolvers(None) - // should have central by default - assert(resolver1.getResolvers.size() === 1) + // should have central and spark-packages by default + assert(resolver1.getResolvers.size() === 2) assert(resolver1.getResolvers.get(0).asInstanceOf[IBiblioResolver].getName === "central") + assert(resolver1.getResolvers.get(1).asInstanceOf[IBiblioResolver].getName === "spark-packages") val repos = "a/1,b/2,c/3" val resolver2 = SparkSubmitUtils.createRepoResolvers(Option(repos)) - assert(resolver2.getResolvers.size() === 4) + assert(resolver2.getResolvers.size() === 5) val expected = repos.split(",").map(r => s"$r/") resolver2.getResolvers.toArray.zipWithIndex.foreach { case (resolver: IBiblioResolver, i) => if (i == 0) { assert(resolver.getName === "central") + } else if (i == 1) { + assert(resolver.getName === "spark-packages") } else { - assert(resolver.getName === s"repo-$i") - assert(resolver.getRoot === expected(i - 1)) + assert(resolver.getName === s"repo-${i - 1}") + assert(resolver.getRoot === expected(i - 2)) } } } diff --git a/docs/programming-guide.md b/docs/programming-guide.md index 118701549a759..4e4af76316863 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -173,8 +173,11 @@ in-process. In the Spark shell, a special interpreter-aware SparkContext is already created for you, in the variable called `sc`. Making your own SparkContext will not work. 
You can set which master the context connects to using the `--master` argument, and you can add JARs to the classpath -by passing a comma-separated list to the `--jars` argument. -For example, to run `bin/spark-shell` on exactly four cores, use: +by passing a comma-separated list to the `--jars` argument. You can also add dependencies +(e.g. Spark Packages) to your shell session by supplying a comma-separated list of maven coordinates +to the `--packages` argument. Any additional repositories where dependencies might exist (e.g. SonaType) +can be passed to the `--repositories` argument. For example, to run `bin/spark-shell` on exactly +four cores, use: {% highlight bash %} $ ./bin/spark-shell --master local[4] @@ -186,6 +189,12 @@ Or, to also add `code.jar` to its classpath, use: $ ./bin/spark-shell --master local[4] --jars code.jar {% endhighlight %} +To include a dependency using maven coordinates: + +{% highlight bash %} +$ ./bin/spark-shell --master local[4] --packages "org.example:example:0.1" +{% endhighlight %} + For a complete list of options, run `spark-shell --help`. Behind the scenes, `spark-shell` invokes the more general [`spark-submit` script](submitting-applications.html). @@ -196,7 +205,11 @@ For a complete list of options, run `spark-shell --help`. Behind the scenes, In the PySpark shell, a special interpreter-aware SparkContext is already created for you, in the variable called `sc`. Making your own SparkContext will not work. You can set which master the context connects to using the `--master` argument, and you can add Python .zip, .egg or .py files -to the runtime path by passing a comma-separated list to `--py-files`. +to the runtime path by passing a comma-separated list to `--py-files`. You can also add dependencies +(e.g. Spark Packages) to your shell session by supplying a comma-separated list of maven coordinates +to the `--packages` argument. Any additional repositories where dependencies might exist (e.g. SonaType) +can be passed to the `--repositories` argument. Any python dependencies a Spark Package has (listed in +the requirements.txt of that package) must be manually installed using pip when necessary. For example, to run `bin/pyspark` on exactly four cores, use: {% highlight bash %} diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index 14a87f8436984..57b074778f2b0 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -174,6 +174,11 @@ This can use up a significant amount of space over time and will need to be clea is handled automatically, and with Spark standalone, automatic cleanup can be configured with the `spark.worker.cleanup.appDataTtl` property. +Users may also include any other dependencies by supplying a comma-delimited list of maven coordinates +with `--packages`. All transitive dependencies will be handled when using this command. Additional +repositories (or resolvers in SBT) can be added in a comma-delimited fashion with the flag `--repositories`. +These commands can be used with `pyspark`, `spark-shell`, and `spark-submit` to include Spark Packages. + For Python, the equivalent `--py-files` option can be used to distribute `.egg`, `.zip` and `.py` libraries to executors. 
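As a concrete illustration of the Python side (mirroring the dummy package used by the new pyspark tests in this patch; `a:mylib:0.1` and `myfunc` are those test fixtures, not a published package), a script submitted with `--packages` can simply import the resolved dependency:

    # app.py -- submitted, for example, with:
    #   bin/spark-submit --packages a:mylib:0.1 \
    #       --repositories file:/path/to/local/repo app.py
    from pyspark import SparkContext
    from mylib import myfunc   # provided by the jar resolved via --packages

    sc = SparkContext()
    print sc.parallelize([1, 2, 3]).map(myfunc).collect()   # [2, 3, 4]
    sc.stop()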
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index f64e25c60740a..52e82091c9f81 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -1440,31 +1440,59 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.programDir) - def createTempFile(self, name, content): + def createTempFile(self, name, content, dir=None): """ Create a temp file with the given name and content and return its path. Strips leading spaces from content up to the first '|' in each line. """ pattern = re.compile(r'^ *\|', re.MULTILINE) content = re.sub(pattern, '', content.strip()) - path = os.path.join(self.programDir, name) + if dir is None: + path = os.path.join(self.programDir, name) + else: + os.makedirs(os.path.join(self.programDir, dir)) + path = os.path.join(self.programDir, dir, name) with open(path, "w") as f: f.write(content) return path - def createFileInZip(self, name, content): + def createFileInZip(self, name, content, ext=".zip", dir=None, zip_name=None): """ Create a zip archive containing a file with the given content and return its path. Strips leading spaces from content up to the first '|' in each line. """ pattern = re.compile(r'^ *\|', re.MULTILINE) content = re.sub(pattern, '', content.strip()) - path = os.path.join(self.programDir, name + ".zip") + if dir is None: + path = os.path.join(self.programDir, name + ext) + else: + path = os.path.join(self.programDir, dir, zip_name + ext) zip = zipfile.ZipFile(path, 'w') zip.writestr(name, content) zip.close() return path + def create_spark_package(self, artifact_name): + group_id, artifact_id, version = artifact_name.split(":") + self.createTempFile("%s-%s.pom" % (artifact_id, version), (""" + | + | + | 4.0.0 + | %s + | %s + | %s + | + """ % (group_id, artifact_id, version)).lstrip(), + os.path.join(group_id, artifact_id, version)) + self.createFileInZip("%s.py" % artifact_id, """ + |def myfunc(x): + | return x + 1 + """, ".jar", os.path.join(group_id, artifact_id, version), + "%s-%s" % (artifact_id, version)) + def test_single_script(self): """Submit and test a single script file""" script = self.createTempFile("test.py", """ @@ -1533,6 +1561,39 @@ def test_module_dependency_on_cluster(self): self.assertEqual(0, proc.returncode) self.assertIn("[2, 3, 4]", out) + def test_package_dependency(self): + """Submit and test a script with a dependency on a Spark Package""" + script = self.createTempFile("test.py", """ + |from pyspark import SparkContext + |from mylib import myfunc + | + |sc = SparkContext() + |print sc.parallelize([1, 2, 3]).map(myfunc).collect() + """) + self.create_spark_package("a:mylib:0.1") + proc = subprocess.Popen([self.sparkSubmit, "--packages", "a:mylib:0.1", "--repositories", + "file:" + self.programDir, script], stdout=subprocess.PIPE) + out, err = proc.communicate() + self.assertEqual(0, proc.returncode) + self.assertIn("[2, 3, 4]", out) + + def test_package_dependency_on_cluster(self): + """Submit and test a script with a dependency on a Spark Package on a cluster""" + script = self.createTempFile("test.py", """ + |from pyspark import SparkContext + |from mylib import myfunc + | + |sc = SparkContext() + |print sc.parallelize([1, 2, 3]).map(myfunc).collect() + """) + self.create_spark_package("a:mylib:0.1") + proc = subprocess.Popen([self.sparkSubmit, "--packages", "a:mylib:0.1", "--repositories", + "file:" + self.programDir, "--master", + "local-cluster[1,1,512]", script], stdout=subprocess.PIPE) + out, err = proc.communicate() + self.assertEqual(0, proc.returncode) + self.assertIn("[2, 3, 4]", 
out) + def test_single_script_on_cluster(self): """Submit and test a single script on a cluster""" script = self.createTempFile("test.py", """ From d46d6246d225ff3af09ebae1a09d4de2430c502d Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 17 Feb 2015 17:39:58 -0800 Subject: [PATCH 233/357] [SPARK-4454] Properly synchronize accesses to DAGScheduler cacheLocs map This patch addresses a race condition in DAGScheduler by properly synchronizing accesses to its `cacheLocs` map. This map is accessed by the `getCacheLocs` and `clearCacheLocs()` methods, which can be called by separate threads, since DAGScheduler's `getPreferredLocs()` method is called by SparkContext and indirectly calls `getCacheLocs()`. If this map is cleared by the DAGScheduler event processing thread while a user thread is submitting a job and computing preferred locations, then this can cause the user thread to throw "NoSuchElementException: key not found" errors. Most accesses to DAGScheduler's internal state do not need synchronization because that state is only accessed from the event processing loop's thread. An alternative approach to fixing this bug would be to refactor this code so that SparkContext sends the DAGScheduler a message in order to get the list of preferred locations. However, this would involve more extensive changes to this code and would be significantly harder to backport to maintenance branches since some of the related code has undergone significant refactoring (e.g. the introduction of EventLoop). Since `cacheLocs` is the only state that's accessed in this way, adding simple synchronization seems like a better short-term fix. See #3345 for additional context. Author: Josh Rosen Closes #4660 from JoshRosen/SPARK-4454 and squashes the following commits: 12d64ba [Josh Rosen] Properly synchronize accesses to DAGScheduler cacheLocs map. --- .../apache/spark/scheduler/DAGScheduler.scala | 34 +++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 79035571adb05..9c355d7c3e632 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -98,7 +98,13 @@ class DAGScheduler( private[scheduler] val activeJobs = new HashSet[ActiveJob] - // Contains the locations that each RDD's partitions are cached on + /** + * Contains the locations that each RDD's partitions are cached on. This map's keys are RDD ids + * and its values are arrays indexed by partition numbers. Each array value is the set of + * locations where that RDD partition is cached. + * + * All accesses to this map should be guarded by synchronizing on it (see SPARK-4454). 
+ */ private val cacheLocs = new HashMap[Int, Array[Seq[TaskLocation]]] // For tracking failed nodes, we use the MapOutputTracker's epoch number, which is sent with @@ -183,18 +189,17 @@ class DAGScheduler( eventProcessLoop.post(TaskSetFailed(taskSet, reason)) } - private def getCacheLocs(rdd: RDD[_]): Array[Seq[TaskLocation]] = { - if (!cacheLocs.contains(rdd.id)) { + private def getCacheLocs(rdd: RDD[_]): Array[Seq[TaskLocation]] = cacheLocs.synchronized { + cacheLocs.getOrElseUpdate(rdd.id, { val blockIds = rdd.partitions.indices.map(index => RDDBlockId(rdd.id, index)).toArray[BlockId] val locs = BlockManager.blockIdsToBlockManagers(blockIds, env, blockManagerMaster) - cacheLocs(rdd.id) = blockIds.map { id => + blockIds.map { id => locs.getOrElse(id, Nil).map(bm => TaskLocation(bm.host, bm.executorId)) } - } - cacheLocs(rdd.id) + }) } - private def clearCacheLocs() { + private def clearCacheLocs(): Unit = cacheLocs.synchronized { cacheLocs.clear() } @@ -1276,17 +1281,26 @@ class DAGScheduler( } /** - * Synchronized method that might be called from other threads. + * Gets the locality information associated with a partition of a particular RDD. + * + * This method is thread-safe and is called from both DAGScheduler and SparkContext. + * * @param rdd whose partitions are to be looked at * @param partition to lookup locality information for * @return list of machines that are preferred by the partition */ private[spark] - def getPreferredLocs(rdd: RDD[_], partition: Int): Seq[TaskLocation] = synchronized { + def getPreferredLocs(rdd: RDD[_], partition: Int): Seq[TaskLocation] = { getPreferredLocsInternal(rdd, partition, new HashSet) } - /** Recursive implementation for getPreferredLocs. */ + /** + * Recursive implementation for getPreferredLocs. + * + * This method is thread-safe because it only accesses DAGScheduler state through thread-safe + * methods (getCacheLocs()); please be careful when modifying this method, because any new + * DAGScheduler state accessed by it may require additional synchronization. + */ private def getPreferredLocsInternal( rdd: RDD[_], partition: Int, From a51fc7ef9adb6a41c4857918217f800858fced2c Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 17 Feb 2015 17:45:16 -0800 Subject: [PATCH 234/357] [SPARK-4454] Revert getOrElse() cleanup in DAGScheduler.getCacheLocs() This method is performance-sensitive and this change wasn't necessary. 
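For illustration only (the fix above is in Scala), the same guarding pattern in a self-contained Python sketch: every access to the shared cache map happens under one lock, so a concurrent clear() can never race with a lookup:

    import threading

    class LocationCache(object):
        # Toy analogue of cacheLocs: one lock guards all reads and writes.
        def __init__(self):
            self._lock = threading.Lock()
            self._cache = {}

        def get(self, key, compute):
            with self._lock:
                if key not in self._cache:
                    self._cache[key] = compute(key)
                return self._cache[key]

        def clear(self):
            with self._lock:
                self._cache.clear()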
--- .../scala/org/apache/spark/scheduler/DAGScheduler.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 9c355d7c3e632..8b62d2405ecb7 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -190,13 +190,15 @@ class DAGScheduler( } private def getCacheLocs(rdd: RDD[_]): Array[Seq[TaskLocation]] = cacheLocs.synchronized { - cacheLocs.getOrElseUpdate(rdd.id, { + // Note: this doesn't use `getOrElse()` because this method is called O(num tasks) times + if (!cacheLocs.contains(rdd.id)) { val blockIds = rdd.partitions.indices.map(index => RDDBlockId(rdd.id, index)).toArray[BlockId] val locs = BlockManager.blockIdsToBlockManagers(blockIds, env, blockManagerMaster) - blockIds.map { id => + cacheLocs(rdd.id) = blockIds.map { id => locs.getOrElse(id, Nil).map(bm => TaskLocation(bm.host, bm.executorId)) } - }) + } + cacheLocs(rdd.id) } private def clearCacheLocs(): Unit = cacheLocs.synchronized { From d5f12bfe8f0a98d6fee114bb24376668ebe2898e Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 17 Feb 2015 17:50:39 -0800 Subject: [PATCH 235/357] [SPARK-5875][SQL]logical.Project should not be resolved if it contains aggregates or generators https://issues.apache.org/jira/browse/SPARK-5875 has a case to reproduce the bug and explain the root cause. Author: Yin Huai Closes #4663 from yhuai/projectResolved and squashes the following commits: 472f7b6 [Yin Huai] If a logical.Project has any AggregateExpression or Generator, it's resolved field should be false. --- .../plans/logical/basicOperators.scala | 10 ++++++ .../sql/catalyst/analysis/AnalysisSuite.scala | 13 +++++++- .../sql/hive/execution/SQLQuerySuite.scala | 32 ++++++++++++++++++- 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala index 9628e93274a11..89544add74430 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala @@ -23,6 +23,16 @@ import org.apache.spark.sql.types._ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) extends UnaryNode { def output = projectList.map(_.toAttribute) + + override lazy val resolved: Boolean = { + val containsAggregatesOrGenerators = projectList.exists ( _.collect { + case agg: AggregateExpression => agg + case generator: Generator => generator + }.nonEmpty + ) + + !expressions.exists(!_.resolved) && childrenResolved && !containsAggregatesOrGenerators + } } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index e70c651e1486e..aec7847356cd4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis import org.scalatest.{BeforeAndAfter, FunSuite} import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.expressions.{Literal, 
Alias, AttributeReference} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ @@ -58,6 +58,17 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { assert(caseInsensitiveAnalyze(plan).resolved) } + test("check project's resolved") { + assert(Project(testRelation.output, testRelation).resolved) + + assert(!Project(Seq(UnresolvedAttribute("a")), testRelation).resolved) + + val explode = Explode(Nil, AttributeReference("a", IntegerType, nullable = true)()) + assert(!Project(Seq(Alias(explode, "explode")()), testRelation).resolved) + + assert(!Project(Seq(Alias(Count(Literal(1)), "count")()), testRelation).resolved) + } + test("analyze project") { assert( caseSensitiveAnalyze(Project(Seq(UnresolvedAttribute("a")), testRelation)) === diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index e8d9eec3d88ff..ae03bc5e9953f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive.execution -import org.apache.spark.sql.hive.HiveShim +import org.apache.spark.sql.hive.{MetastoreRelation, HiveShim} import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ @@ -316,4 +316,34 @@ class SQLQuerySuite extends QueryTest { dropTempTable("data") } + + test("logical.Project should not be resolved if it contains aggregates or generators") { + // This test is used to test the fix of SPARK-5875. + // The original issue was that Project's resolved will be true when it contains + // AggregateExpressions or Generators. However, in this case, the Project + // is not in a valid state (cannot be executed). Because of this bug, the analysis rule of + // PreInsertionCasts will actually start to work before ImplicitGenerate and then + // generates an invalid query plan. + val rdd = sparkContext.makeRDD((1 to 5).map(i => s"""{"a":[$i, ${i+1}]}""")) + jsonRDD(rdd).registerTempTable("data") + val originalConf = getConf("spark.sql.hive.convertCTAS", "false") + setConf("spark.sql.hive.convertCTAS", "false") + + sql("CREATE TABLE explodeTest (key bigInt)") + table("explodeTest").queryExecution.analyzed match { + case metastoreRelation: MetastoreRelation => // OK + case _ => + fail("To correctly test the fix of SPARK-5875, explodeTest should be a MetastoreRelation") + } + + sql(s"INSERT OVERWRITE TABLE explodeTest SELECT explode(a) AS val FROM data") + checkAnswer( + sql("SELECT key from explodeTest"), + (1 to 5).flatMap(i => Row(i) :: Row(i + 1) :: Nil) + ) + + sql("DROP TABLE explodeTest") + dropTempTable("data") + setConf("spark.sql.hive.convertCTAS", originalConf) + } } From e50934f11e1e3ded21a631e5ab69db3c79467137 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 17 Feb 2015 18:14:33 -0800 Subject: [PATCH 236/357] [SPARK-5723][SQL]Change the default file format to Parquet for CTAS statements. 
JIRA: https://issues.apache.org/jira/browse/SPARK-5723 Author: Yin Huai This patch had conflicts when merged, resolved by Committer: Michael Armbrust Closes #4639 from yhuai/defaultCTASFileFormat and squashes the following commits: a568137 [Yin Huai] Merge remote-tracking branch 'upstream/master' into defaultCTASFileFormat ad2b07d [Yin Huai] Update tests and error messages. 8af5b2a [Yin Huai] Update conf key and unit test. 5a67903 [Yin Huai] Use data source write path for Hive's CTAS statements when no storage format/handler is specified. --- .../apache/spark/sql/hive/HiveContext.scala | 15 ++++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 75 +++++++++++++++---- .../spark/sql/hive/execution/commands.scala | 17 +++-- .../sql/hive/MetastoreDataSourcesSuite.scala | 6 +- .../sql/hive/execution/SQLQuerySuite.scala | 70 +++++++++++++++++ 5 files changed, 158 insertions(+), 25 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 6c55bc6be17f9..d3365b1e8f44c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -61,6 +61,21 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { protected[sql] def convertMetastoreParquet: Boolean = getConf("spark.sql.hive.convertMetastoreParquet", "true") == "true" + /** + * When true, a table created by a Hive CTAS statement (no USING clause) will be + * converted to a data source table, using the data source set by spark.sql.sources.default. + * The table in CTAS statement will be converted when it meets any of the following conditions: + * - The CTAS does not specify any of a SerDe (ROW FORMAT SERDE), a File Format (STORED AS), or + * a Storage Hanlder (STORED BY), and the value of hive.default.fileformat in hive-site.xml + * is either TextFile or SequenceFile. + * - The CTAS statement specifies TextFile (STORED AS TEXTFILE) as the file format and no SerDe + * is specified (no ROW FORMAT SERDE clause). + * - The CTAS statement specifies SequenceFile (STORED AS SEQUENCEFILE) as the file format + * and no SerDe is specified (no ROW FORMAT SERDE clause). 
+ */ + protected[sql] def convertCTAS: Boolean = + getConf("spark.sql.hive.convertCTAS", "false").toBoolean + override protected[sql] def executePlan(plan: LogicalPlan): this.QueryExecution = new this.QueryExecution(plan) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index cfd6f27371d0d..f7ad2efc9544e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -31,7 +31,7 @@ import org.apache.hadoop.hive.serde2.{Deserializer, SerDeException} import org.apache.hadoop.util.ReflectionUtils import org.apache.spark.Logging -import org.apache.spark.sql.{AnalysisException, SQLContext} +import org.apache.spark.sql.{SaveMode, AnalysisException, SQLContext} import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, Catalog, OverrideCatalog} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation @@ -39,7 +39,7 @@ import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.parquet.{ParquetRelation2, Partition => ParquetPartition, PartitionSpec} -import org.apache.spark.sql.sources.{DDLParser, LogicalRelation, ResolvedDataSource} +import org.apache.spark.sql.sources.{CreateTableUsingAsSelect, DDLParser, LogicalRelation, ResolvedDataSource} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -502,24 +502,69 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with Some(sa.getQB().getTableDesc) } - execution.CreateTableAsSelect( - databaseName, - tableName, - child, - allowExisting, - desc) + // Check if the query specifies file format or storage handler. + val hasStorageSpec = desc match { + case Some(crtTbl) => + crtTbl != null && (crtTbl.getSerName != null || crtTbl.getStorageHandler != null) + case None => false + } + + if (hive.convertCTAS && !hasStorageSpec) { + // Do the conversion when spark.sql.hive.convertCTAS is true and the query + // does not specify any storage format (file format and storage handler). 
+ if (dbName.isDefined) { + throw new AnalysisException( + "Cannot specify database name in a CTAS statement " + + "when spark.sql.hive.convertCTAS is set to true.") + } + + val mode = if (allowExisting) SaveMode.Ignore else SaveMode.ErrorIfExists + CreateTableUsingAsSelect( + tblName, + hive.conf.defaultDataSourceName, + temporary = false, + mode, + options = Map.empty[String, String], + child + ) + } else { + execution.CreateTableAsSelect( + databaseName, + tableName, + child, + allowExisting, + desc) + } case p: LogicalPlan if p.resolved => p case p @ CreateTableAsSelect(db, tableName, child, allowExisting, None) => val (dbName, tblName) = processDatabaseAndTableName(db, tableName) - val databaseName = dbName.getOrElse(hive.sessionState.getCurrentDatabase) - execution.CreateTableAsSelect( - databaseName, - tableName, - child, - allowExisting, - None) + if (hive.convertCTAS) { + if (dbName.isDefined) { + throw new AnalysisException( + "Cannot specify database name in a CTAS statement " + + "when spark.sql.hive.convertCTAS is set to true.") + } + + val mode = if (allowExisting) SaveMode.Ignore else SaveMode.ErrorIfExists + CreateTableUsingAsSelect( + tblName, + hive.conf.defaultDataSourceName, + temporary = false, + mode, + options = Map.empty[String, String], + child + ) + } else { + val databaseName = dbName.getOrElse(hive.sessionState.getCurrentDatabase) + execution.CreateTableAsSelect( + databaseName, + tableName, + child, + allowExisting, + None) + } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 6afd8eea05418..c88d0e6b79491 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.sources._ @@ -121,7 +122,7 @@ case class CreateMetastoreDataSource( if (allowExisting) { return Seq.empty[Row] } else { - sys.error(s"Table $tableName already exists.") + throw new AnalysisException(s"Table $tableName already exists.") } } @@ -172,9 +173,11 @@ case class CreateMetastoreDataSourceAsSelect( // Check if we need to throw an exception or just return. mode match { case SaveMode.ErrorIfExists => - sys.error(s"Table $tableName already exists. " + - s"If you want to append into it, please set mode to SaveMode.Append. " + - s"Or, if you want to overwrite it, please set mode to SaveMode.Overwrite.") + throw new AnalysisException(s"Table $tableName already exists. " + + s"If you are using saveAsTable, you can set SaveMode to SaveMode.Append to " + + s"insert data into the table or set SaveMode to SaveMode.Overwrite to overwrite" + + s"the existing data. " + + s"Or, if you are using SQL CREATE TABLE, you need to drop $tableName first.") case SaveMode.Ignore => // Since the table already exists and the save mode is Ignore, we will just return. 
return Seq.empty[Row] @@ -199,7 +202,7 @@ case class CreateMetastoreDataSourceAsSelect( s"== Actual Schema ==" +: createdRelation.schema.treeString.split("\\\n")).mkString("\n")} """.stripMargin - sys.error(errorMessage) + throw new AnalysisException(errorMessage) } else if (i != createdRelation.relation) { val errorDescription = s"Cannot append to table $tableName because the resolved relation does not " + @@ -216,10 +219,10 @@ case class CreateMetastoreDataSourceAsSelect( s"== Actual Relation ==" :: createdRelation.toString :: Nil).mkString("\n")} """.stripMargin - sys.error(errorMessage) + throw new AnalysisException(errorMessage) } case o => - sys.error(s"Saving data in ${o.toString} is not supported.") + throw new AnalysisException(s"Saving data in ${o.toString} is not supported.") } case SaveMode.Overwrite => hiveContext.sql(s"DROP TABLE IF EXISTS $tableName") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index c30090fabbc8d..e5156ae821bf4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -306,8 +306,8 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { |SELECT * FROM jsonTable """.stripMargin) - // Create the table again should trigger a AlreadyExistsException. - val message = intercept[RuntimeException] { + // Create the table again should trigger a AnalysisException. + val message = intercept[AnalysisException] { sql( s""" |CREATE TABLE ctasJsonTable @@ -516,7 +516,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { sql("SELECT * FROM createdJsonTable"), df.collect()) - var message = intercept[RuntimeException] { + var message = intercept[AnalysisException] { createExternalTable("createdJsonTable", filePath.toString) }.getMessage assert(message.contains("Table createdJsonTable already exists."), diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index ae03bc5e9953f..f2bc73bf3bdf9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -17,10 +17,13 @@ package org.apache.spark.sql.hive.execution +import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries import org.apache.spark.sql.hive.{MetastoreRelation, HiveShim} import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.parquet.ParquetRelation2 +import org.apache.spark.sql.sources.LogicalRelation import org.apache.spark.sql.types._ import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SQLConf} @@ -42,6 +45,73 @@ class SQLQuerySuite extends QueryTest { ) } + test("CTAS without serde") { + def checkRelation(tableName: String, isDataSourceParquet: Boolean): Unit = { + val relation = EliminateSubQueries(catalog.lookupRelation(Seq(tableName))) + relation match { + case LogicalRelation(r: ParquetRelation2) => + if (!isDataSourceParquet) { + fail( + s"${classOf[MetastoreRelation].getCanonicalName} is expected, but found " + + s"${ParquetRelation2.getClass.getCanonicalName}.") + } + + case r: MetastoreRelation => + if 
(isDataSourceParquet) { + fail( + s"${ParquetRelation2.getClass.getCanonicalName} is expected, but found " + + s"${classOf[MetastoreRelation].getCanonicalName}.") + } + } + } + + val originalConf = getConf("spark.sql.hive.convertCTAS", "false") + + setConf("spark.sql.hive.convertCTAS", "true") + + sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + var message = intercept[AnalysisException] { + sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + }.getMessage + assert(message.contains("Table ctas1 already exists")) + checkRelation("ctas1", true) + sql("DROP TABLE ctas1") + + // Specifying database name for query can be converted to data source write path + // is not allowed right now. + message = intercept[AnalysisException] { + sql("CREATE TABLE default.ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + }.getMessage + assert( + message.contains("Cannot specify database name in a CTAS statement"), + "When spark.sql.hive.convertCTAS is true, we should not allow " + + "database name specified.") + + sql("CREATE TABLE ctas1 stored as textfile AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", true) + sql("DROP TABLE ctas1") + + sql( + "CREATE TABLE ctas1 stored as sequencefile AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", true) + sql("DROP TABLE ctas1") + + sql("CREATE TABLE ctas1 stored as rcfile AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", false) + sql("DROP TABLE ctas1") + + sql("CREATE TABLE ctas1 stored as orc AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", false) + sql("DROP TABLE ctas1") + + sql("CREATE TABLE ctas1 stored as parquet AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", false) + sql("DROP TABLE ctas1") + + setConf("spark.sql.hive.convertCTAS", originalConf) + } + test("CTAS with serde") { sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value").collect() sql( From 3912d332464dcd124c60b734724c34d9742466a4 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 17 Feb 2015 22:44:16 -0800 Subject: [PATCH 237/357] [SPARK-5731][Streaming][Test] Fix incorrect test in DirectKafkaStreamSuite The test was incorrect. Instead of counting the number of records, it counted the number of partitions of RDD generated by DStream. Which is not its intention. I will be testing this patch multiple times to understand its flakiness. PS: This was caused by my refactoring in https://github.com/apache/spark/pull/4384/ koeninger check it out. Author: Tathagata Das Closes #4597 from tdas/kafka-flaky-test and squashes the following commits: d236235 [Tathagata Das] Unignored last test. 
e9a1820 [Tathagata Das] fix test --- .../kafka/DirectKafkaStreamSuite.scala | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala index 926094449e7fc..17ca9d145d665 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala @@ -20,20 +20,21 @@ package org.apache.spark.streaming.kafka import java.io.File import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ import scala.language.postfixOps +import kafka.common.TopicAndPartition +import kafka.message.MessageAndMetadata import kafka.serializer.StringDecoder import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} -import org.scalatest.concurrent.{Eventually, Timeouts} +import org.scalatest.concurrent.Eventually -import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Milliseconds, StreamingContext, Time} -import org.apache.spark.streaming.dstream.{DStream, InputDStream} +import org.apache.spark.streaming.dstream.DStream import org.apache.spark.util.Utils -import kafka.common.TopicAndPartition -import kafka.message.MessageAndMetadata class DirectKafkaStreamSuite extends KafkaStreamSuiteBase with BeforeAndAfter with BeforeAndAfterAll with Eventually { @@ -67,13 +68,14 @@ class DirectKafkaStreamSuite extends KafkaStreamSuiteBase } - ignore("basic stream receiving with multiple topics and smallest starting offset") { + test("basic stream receiving with multiple topics and smallest starting offset") { val topics = Set("basic1", "basic2", "basic3") val data = Map("a" -> 7, "b" -> 9) topics.foreach { t => createTopic(t) sendMessages(t, data) } + val totalSent = data.values.sum * topics.size val kafkaParams = Map( "metadata.broker.list" -> s"$brokerAddress", "auto.offset.reset" -> "smallest" @@ -84,7 +86,8 @@ class DirectKafkaStreamSuite extends KafkaStreamSuiteBase KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( ssc, kafkaParams, topics) } - var total = 0L + + val allReceived = new ArrayBuffer[(String, String)] stream.foreachRDD { rdd => // Get the offset ranges in the RDD @@ -104,16 +107,17 @@ class DirectKafkaStreamSuite extends KafkaStreamSuiteBase collected.foreach { case (partSize, rangeSize) => assert(partSize === rangeSize, "offset ranges are wrong") } - total += collected.size // Add up all the collected items } + stream.foreachRDD { rdd => allReceived ++= rdd.collect() } ssc.start() eventually(timeout(20000.milliseconds), interval(200.milliseconds)) { - assert(total === data.values.sum * topics.size, "didn't get all messages") + assert(allReceived.size === totalSent, + "didn't get expected number of messages, messages:\n" + allReceived.mkString("\n")) } ssc.stop() } - ignore("receiving from largest starting offset") { + test("receiving from largest starting offset") { val topic = "largest" val topicPartition = TopicAndPartition(topic, 0) val data = Map("a" -> 10) @@ -158,7 +162,7 @@ class DirectKafkaStreamSuite extends KafkaStreamSuiteBase } - ignore("creating stream by offset") { + test("creating stream by offset") { val topic = "offset" val topicPartition = TopicAndPartition(topic, 0) 
val data = Map("a" -> 10) @@ -204,7 +208,7 @@ class DirectKafkaStreamSuite extends KafkaStreamSuiteBase } // Test to verify the offset ranges can be recovered from the checkpoints - ignore("offset recovery") { + test("offset recovery") { val topic = "recovery" createTopic(topic) testDir = Utils.createTempDir() From 61ab08549cb6fceb6de1b5c490c55a89d4bd28fa Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 17 Feb 2015 23:36:20 -0800 Subject: [PATCH 238/357] [Minor] [SQL] Cleans up DataFrame variable names and toDF() calls Although we've migrated to the DataFrame API, lots of code still uses `rdd` or `srdd` as local variable names. This PR tries to address these naming inconsistencies and some other minor DataFrame related style issues. [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4670) Author: Cheng Lian Closes #4670 from liancheng/df-cleanup and squashes the following commits: 3e14448 [Cheng Lian] Cleans up DataFrame variable names and toDF() calls --- .../examples/ml/CrossValidatorExample.scala | 2 +- .../examples/ml/DeveloperApiExample.scala | 4 +- .../spark/examples/ml/MovieLensALS.scala | 6 +-- .../examples/ml/SimpleParamsExample.scala | 6 +-- .../ml/SimpleTextClassificationPipeline.scala | 4 +- .../spark/examples/mllib/DatasetExample.scala | 2 +- .../spark/examples/sql/RDDRelation.scala | 2 +- .../examples/sql/hive/HiveFromSpark.scala | 2 +- .../mllib/classification/NaiveBayes.scala | 2 +- .../impl/GLMClassificationModel.scala | 2 +- .../regression/impl/GLMRegressionModel.scala | 2 +- .../mllib/tree/model/DecisionTreeModel.scala | 2 +- .../mllib/tree/model/treeEnsembleModels.scala | 2 +- .../spark/ml/recommendation/ALSSuite.scala | 4 +- .../org/apache/spark/sql/DataFrame.scala | 2 +- .../org/apache/spark/sql/SQLContext.scala | 4 +- .../spark/sql/parquet/ParquetTest.scala | 6 +-- .../apache/spark/sql/CachedTableSuite.scala | 14 +++-- .../org/apache/spark/sql/DataFrameSuite.scala | 26 +++++---- .../org/apache/spark/sql/JoinSuite.scala | 8 +-- .../org/apache/spark/sql/QueryTest.scala | 46 ++++++++-------- .../org/apache/spark/sql/SQLQuerySuite.scala | 6 +-- .../sql/ScalaReflectionRelationSuite.scala | 10 ++-- .../scala/org/apache/spark/sql/TestData.scala | 48 +++++++++-------- .../spark/sql/jdbc/JDBCWriteSuite.scala | 46 ++++++++-------- .../spark/sql/jdbc/MySQLIntegration.scala | 53 ++++++++----------- .../spark/sql/jdbc/PostgresIntegration.scala | 30 +++++------ .../sql/parquet/ParquetFilterSuite.scala | 40 +++++++------- .../spark/sql/parquet/ParquetIOSuite.scala | 28 +++++----- .../spark/sql/parquet/ParquetQuerySuite.scala | 2 +- .../sql/parquet/ParquetSchemaSuite.scala | 4 +- .../sql/hive/InsertIntoHiveTableSuite.scala | 4 +- .../sql/hive/MetastoreDataSourcesSuite.scala | 8 +-- .../spark/sql/hive/StatisticsSuite.scala | 38 ++++++------- .../sql/hive/execution/HiveQuerySuite.scala | 20 +++---- .../hive/execution/HiveResolutionSuite.scala | 6 +-- .../sql/hive/execution/HiveUdfSuite.scala | 18 +++---- 37 files changed, 250 insertions(+), 259 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala index f0241943ef410..7ab892cd7560c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala @@ -90,7 +90,7 @@ object CrossValidatorExample { crossval.setNumFolds(2) // Use 3+ in practice // Run cross-validation, and 
choose the best set of parameters. - val cvModel = crossval.fit(training.toDF) + val cvModel = crossval.fit(training.toDF()) // Prepare test documents, which are unlabeled. val test = sc.parallelize(Seq( diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index 54aadd2288817..df26798e41b7b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -58,7 +58,7 @@ object DeveloperApiExample { lr.setMaxIter(10) // Learn a LogisticRegression model. This uses the parameters stored in lr. - val model = lr.fit(training.toDF) + val model = lr.fit(training.toDF()) // Prepare test data. val test = sc.parallelize(Seq( @@ -67,7 +67,7 @@ object DeveloperApiExample { LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))) // Make predictions on test data. - val sumPredictions: Double = model.transform(test.toDF) + val sumPredictions: Double = model.transform(test.toDF()) .select("features", "label", "prediction") .collect() .map { case Row(features: Vector, label: Double, prediction: Double) => diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala index adaf796dc1896..96b2dd463e253 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala @@ -137,9 +137,9 @@ object MovieLensALS { .setRegParam(params.regParam) .setNumBlocks(params.numBlocks) - val model = als.fit(training.toDF) + val model = als.fit(training.toDF()) - val predictions = model.transform(test.toDF).cache() + val predictions = model.transform(test.toDF()).cache() // Evaluate the model. // TODO: Create an evaluator to compute RMSE. @@ -158,7 +158,7 @@ object MovieLensALS { // Inspect false positives. predictions.registerTempTable("prediction") - sc.textFile(params.movies).map(Movie.parseMovie).toDF.registerTempTable("movie") + sc.textFile(params.movies).map(Movie.parseMovie).toDF().registerTempTable("movie") sqlContext.sql( """ |SELECT userId, prediction.movieId, title, rating, prediction diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala index c5bb5515b1930..e8af5c162586a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala @@ -58,7 +58,7 @@ object SimpleParamsExample { .setRegParam(0.01) // Learn a LogisticRegression model. This uses the parameters stored in lr. - val model1 = lr.fit(training.toDF) + val model1 = lr.fit(training.toDF()) // Since model1 is a Model (i.e., a Transformer produced by an Estimator), // we can view the parameters it used during fit(). // This prints the parameter (name: value) pairs, where names are unique IDs for this @@ -77,7 +77,7 @@ object SimpleParamsExample { // Now learn a new model using the paramMapCombined parameters. // paramMapCombined overrides all parameters set earlier via lr.set* methods. - val model2 = lr.fit(training.toDF, paramMapCombined) + val model2 = lr.fit(training.toDF(), paramMapCombined) println("Model 2 was fit using parameters: " + model2.fittingParamMap) // Prepare test data. 
@@ -90,7 +90,7 @@ object SimpleParamsExample { // LogisticRegression.transform will only use the 'features' column. // Note that model2.transform() outputs a 'myProbability' column instead of the usual // 'probability' column since we renamed the lr.probabilityCol parameter previously. - model2.transform(test.toDF) + model2.transform(test.toDF()) .select("features", "label", "myProbability", "prediction") .collect() .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala index 8b47f88e48df1..a11db6fd5c382 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala @@ -69,7 +69,7 @@ object SimpleTextClassificationPipeline { .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. - val model = pipeline.fit(training.toDF) + val model = pipeline.fit(training.toDF()) // Prepare test documents, which are unlabeled. val test = sc.parallelize(Seq( @@ -79,7 +79,7 @@ object SimpleTextClassificationPipeline { Document(7L, "apache hadoop"))) // Make predictions on test documents. - model.transform(test.toDF) + model.transform(test.toDF()) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala index c98c68a02f2be..e943d6c889fab 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala @@ -81,7 +81,7 @@ object DatasetExample { println(s"Loaded ${origData.count()} instances from file: ${params.input}") // Convert input data to DataFrame explicitly. - val df: DataFrame = origData.toDF + val df: DataFrame = origData.toDF() println(s"Inferred schema:\n${df.schema.prettyJson}") println(s"Converted to DataFrame with ${df.count()} records") diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala index 79d3d5a24ceaf..6331d1c0060f8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/RDDRelation.scala @@ -34,7 +34,7 @@ object RDDRelation { // Importing the SQL context gives access to all the SQL functions and implicit conversions. import sqlContext.implicits._ - val df = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))).toDF + val df = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))).toDF() // Any RDD containing case classes can be registered as a table. The schema of the table is // automatically inferred using scala reflection. 
df.registerTempTable("records") diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala index 7128deba54da7..b7ba60ec28155 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/HiveFromSpark.scala @@ -68,7 +68,7 @@ object HiveFromSpark { // You can also register RDDs as temporary tables within a HiveContext. val rdd = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))) - rdd.toDF.registerTempTable("records") + rdd.toDF().registerTempTable("records") // Queries can then join RDD data with data stored in Hive. println("Result of SELECT *:") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index dd7a9469d5041..b11fd4f128c56 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -102,7 +102,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path)) // Create Parquet data. - val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF + val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF() dataRDD.saveAsParquetFile(dataPath(path)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala index 0a358f2e4f39c..8956189ff1158 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/impl/GLMClassificationModel.scala @@ -62,7 +62,7 @@ private[classification] object GLMClassificationModel { // Create Parquet data. val data = Data(weights, intercept, threshold) - sc.parallelize(Seq(data), 1).toDF.saveAsParquetFile(Loader.dataPath(path)) + sc.parallelize(Seq(data), 1).toDF().saveAsParquetFile(Loader.dataPath(path)) } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala index 7b27aaa322b00..bd7e340ca2d8e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/impl/GLMRegressionModel.scala @@ -58,7 +58,7 @@ private[regression] object GLMRegressionModel { // Create Parquet data. 
val data = Data(weights, intercept) - val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF + val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF() // TODO: repartition with 1 partition after SPARK-5532 gets fixed dataRDD.saveAsParquetFile(Loader.dataPath(path)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala index 5dac62b0c42f0..060fd5b859a51 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala @@ -197,7 +197,7 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] { val nodes = model.topNode.subtreeIterator.toSeq val dataRDD: DataFrame = sc.parallelize(nodes) .map(NodeData.apply(0, _)) - .toDF + .toDF() dataRDD.saveAsParquetFile(Loader.dataPath(path)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index e507f247cca76..4897906aea5b3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -289,7 +289,7 @@ private[tree] object TreeEnsembleModel { // Create Parquet data. val dataRDD = sc.parallelize(model.trees.zipWithIndex).flatMap { case (tree, treeId) => tree.topNode.subtreeIterator.toSeq.map(node => NodeData(treeId, node)) - }.toDF + }.toDF() dataRDD.saveAsParquetFile(Loader.dataPath(path)) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index b118a8dcf1363..376c3626f9bbb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -358,8 +358,8 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { .setNumUserBlocks(numUserBlocks) .setNumItemBlocks(numItemBlocks) val alpha = als.getAlpha - val model = als.fit(training.toDF) - val predictions = model.transform(test.toDF) + val model = als.fit(training.toDF()) + val predictions = model.transform(test.toDF()) .select("rating", "prediction") .map { case Row(rating: Float, prediction: Float) => (rating.toDouble, prediction.toDouble) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index fa5fe84263ece..5007a5a34de1a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -124,7 +124,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * from a RDD of tuples into a [[DataFrame]] with meaningful names. For example: * {{{ * val rdd: RDD[(Int, String)] = ... 
- * rdd.toDF // this implicit conversion creates a DataFrame with column name _1 and _2 + * rdd.toDF() // this implicit conversion creates a DataFrame with column name _1 and _2 * rdd.toDF("id", "name") // this creates a DataFrame with column name "id" and "name" * }}} * @group basic diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 709b350144c75..db32fa80dd3e7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -887,8 +887,8 @@ class SQLContext(@transient val sparkContext: SparkContext) * Registers the given [[DataFrame]] as a temporary table in the catalog. Temporary tables exist * only during the lifetime of this instance of SQLContext. */ - private[sql] def registerDataFrameAsTable(rdd: DataFrame, tableName: String): Unit = { - catalog.registerTable(Seq(tableName), rdd.logicalPlan) + private[sql] def registerDataFrameAsTable(df: DataFrame, tableName: String): Unit = { + catalog.registerTable(Seq(tableName), df.logicalPlan) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala index 052728c5d5ceb..0fa2fe90f9674 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -99,7 +99,7 @@ private[sql] trait ParquetTest { * Writes `data` to a Parquet file and reads it back as a [[DataFrame]], * which is then passed to `f`. The Parquet file will be deleted after `f` returns. */ - protected def withParquetRDD[T <: Product: ClassTag: TypeTag] + protected def withParquetDataFrame[T <: Product: ClassTag: TypeTag] (data: Seq[T]) (f: DataFrame => Unit): Unit = { withParquetFile(data)(path => f(sqlContext.parquetFile(path))) @@ -120,8 +120,8 @@ private[sql] trait ParquetTest { protected def withParquetTable[T <: Product: ClassTag: TypeTag] (data: Seq[T], tableName: String) (f: => Unit): Unit = { - withParquetRDD(data) { rdd => - sqlContext.registerDataFrameAsTable(rdd, tableName) + withParquetDataFrame(data) { df => + sqlContext.registerDataFrameAsTable(df, tableName) withTempTable(tableName)(f) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 691dae0a0561b..e70e866fdbf14 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -18,17 +18,15 @@ package org.apache.spark.sql import scala.concurrent.duration._ -import scala.language.implicitConversions -import scala.language.postfixOps +import scala.language.{implicitConversions, postfixOps} import org.scalatest.concurrent.Eventually._ import org.apache.spark.sql.TestData._ import org.apache.spark.sql.columnar._ -import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.sql.test.TestSQLContext.implicits._ -import org.apache.spark.storage.{StorageLevel, RDDBlockId} +import org.apache.spark.storage.{RDDBlockId, StorageLevel} case class BigData(s: String) @@ -59,15 +57,15 @@ class CachedTableSuite extends QueryTest { test("unpersist an uncached table will not raise exception") { assert(None == cacheManager.lookupCachedData(testData)) - testData.unpersist(true) + testData.unpersist(blocking = true) 
assert(None == cacheManager.lookupCachedData(testData)) - testData.unpersist(false) + testData.unpersist(blocking = false) assert(None == cacheManager.lookupCachedData(testData)) testData.persist() assert(None != cacheManager.lookupCachedData(testData)) - testData.unpersist(true) + testData.unpersist(blocking = true) assert(None == cacheManager.lookupCachedData(testData)) - testData.unpersist(false) + testData.unpersist(blocking = false) assert(None == cacheManager.lookupCachedData(testData)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 0da619def1d09..f31bc38922d4e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql -import org.apache.spark.sql.TestData._ - import scala.language.postfixOps import org.apache.spark.sql.functions._ @@ -251,20 +249,20 @@ class DataFrameSuite extends QueryTest { Seq(Row(3,1), Row(3,2), Row(2,1), Row(2,2), Row(1,1), Row(1,2))) checkAnswer( - arrayData.toDF.orderBy('data.getItem(0).asc), - arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(0)).toSeq) + arrayData.toDF().orderBy('data.getItem(0).asc), + arrayData.toDF().collect().sortBy(_.getAs[Seq[Int]](0)(0)).toSeq) checkAnswer( - arrayData.toDF.orderBy('data.getItem(0).desc), - arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(0)).reverse.toSeq) + arrayData.toDF().orderBy('data.getItem(0).desc), + arrayData.toDF().collect().sortBy(_.getAs[Seq[Int]](0)(0)).reverse.toSeq) checkAnswer( - arrayData.toDF.orderBy('data.getItem(1).asc), - arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(1)).toSeq) + arrayData.toDF().orderBy('data.getItem(1).asc), + arrayData.toDF().collect().sortBy(_.getAs[Seq[Int]](0)(1)).toSeq) checkAnswer( - arrayData.toDF.orderBy('data.getItem(1).desc), - arrayData.toDF.collect().sortBy(_.getAs[Seq[Int]](0)(1)).reverse.toSeq) + arrayData.toDF().orderBy('data.getItem(1).desc), + arrayData.toDF().collect().sortBy(_.getAs[Seq[Int]](0)(1)).reverse.toSeq) } test("limit") { @@ -273,11 +271,11 @@ class DataFrameSuite extends QueryTest { testData.take(10).toSeq) checkAnswer( - arrayData.toDF.limit(1), + arrayData.toDF().limit(1), arrayData.take(1).map(r => Row.fromSeq(r.productIterator.toSeq))) checkAnswer( - mapData.toDF.limit(1), + mapData.toDF().limit(1), mapData.take(1).map(r => Row.fromSeq(r.productIterator.toSeq))) } @@ -411,7 +409,7 @@ class DataFrameSuite extends QueryTest { } test("addColumn") { - val df = testData.toDF.withColumn("newCol", col("key") + 1) + val df = testData.toDF().withColumn("newCol", col("key") + 1) checkAnswer( df, testData.collect().map { case Row(key: Int, value: String) => @@ -421,7 +419,7 @@ class DataFrameSuite extends QueryTest { } test("renameColumn") { - val df = testData.toDF.withColumn("newCol", col("key") + 1) + val df = testData.toDF().withColumn("newCol", col("key") + 1) .withColumnRenamed("value", "valueRenamed") checkAnswer( df, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index fd73065c4ada3..dd0948ad824be 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -40,8 +40,8 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } def assertJoin(sqlString: String, c: Class[_]): Any = { - val rdd = sql(sqlString) - val 
physical = rdd.queryExecution.sparkPlan + val df = sql(sqlString) + val physical = df.queryExecution.sparkPlan val operators = physical.collect { case j: ShuffledHashJoin => j case j: HashOuterJoin => j @@ -410,8 +410,8 @@ class JoinSuite extends QueryTest with BeforeAndAfterEach { } test("left semi join") { - val rdd = sql("SELECT * FROM testData2 LEFT SEMI JOIN testData ON key = a") - checkAnswer(rdd, + val df = sql("SELECT * FROM testData2 LEFT SEMI JOIN testData ON key = a") + checkAnswer(df, Row(1, 1) :: Row(1, 2) :: Row(2, 1) :: diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index dfb6858957fb9..9b4dd6c620fec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -35,36 +35,36 @@ class QueryTest extends PlanTest { /** * Runs the plan and makes sure the answer contains all of the keywords, or the * none of keywords are listed in the answer - * @param rdd the [[DataFrame]] to be executed + * @param df the [[DataFrame]] to be executed * @param exists true for make sure the keywords are listed in the output, otherwise * to make sure none of the keyword are not listed in the output * @param keywords keyword in string array */ - def checkExistence(rdd: DataFrame, exists: Boolean, keywords: String*) { - val outputs = rdd.collect().map(_.mkString).mkString + def checkExistence(df: DataFrame, exists: Boolean, keywords: String*) { + val outputs = df.collect().map(_.mkString).mkString for (key <- keywords) { if (exists) { - assert(outputs.contains(key), s"Failed for $rdd ($key doens't exist in result)") + assert(outputs.contains(key), s"Failed for $df ($key doesn't exist in result)") } else { - assert(!outputs.contains(key), s"Failed for $rdd ($key existed in the result)") + assert(!outputs.contains(key), s"Failed for $df ($key existed in the result)") } } } /** * Runs the plan and makes sure the answer matches the expected result. - * @param rdd the [[DataFrame]] to be executed + * @param df the [[DataFrame]] to be executed * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. */ - protected def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Unit = { - QueryTest.checkAnswer(rdd, expectedAnswer) match { + protected def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row]): Unit = { + QueryTest.checkAnswer(df, expectedAnswer) match { case Some(errorMessage) => fail(errorMessage) case None => } } - protected def checkAnswer(rdd: DataFrame, expectedAnswer: Row): Unit = { - checkAnswer(rdd, Seq(expectedAnswer)) + protected def checkAnswer(df: DataFrame, expectedAnswer: Row): Unit = { + checkAnswer(df, Seq(expectedAnswer)) } def sqlTest(sqlString: String, expectedAnswer: Seq[Row])(implicit sqlContext: SQLContext): Unit = { @@ -95,11 +95,11 @@ object QueryTest { * If there was exception during the execution or the contents of the DataFrame does not * match the expected result, an error message will be returned. Otherwise, a [[None]] will * be returned. - * @param rdd the [[DataFrame]] to be executed + * @param df the [[DataFrame]] to be executed * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. 
*/ - def checkAnswer(rdd: DataFrame, expectedAnswer: Seq[Row]): Option[String] = { - val isSorted = rdd.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty + def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row]): Option[String] = { + val isSorted = df.logicalPlan.collect { case s: logical.Sort => s }.nonEmpty def prepareAnswer(answer: Seq[Row]): Seq[Row] = { // Converts data to types that we can do equality comparison using Scala collections. // For BigDecimal type, the Scala type has a better definition of equality test (similar to @@ -110,14 +110,14 @@ object QueryTest { case o => o }) } - if (!isSorted) converted.sortBy(_.toString) else converted + if (!isSorted) converted.sortBy(_.toString()) else converted } - val sparkAnswer = try rdd.collect().toSeq catch { + val sparkAnswer = try df.collect().toSeq catch { case e: Exception => val errorMessage = s""" |Exception thrown while executing query: - |${rdd.queryExecution} + |${df.queryExecution} |== Exception == |$e |${org.apache.spark.sql.catalyst.util.stackTraceToString(e)} @@ -129,17 +129,17 @@ object QueryTest { val errorMessage = s""" |Results do not match for query: - |${rdd.logicalPlan} + |${df.logicalPlan} |== Analyzed Plan == - |${rdd.queryExecution.analyzed} + |${df.queryExecution.analyzed} |== Physical Plan == - |${rdd.queryExecution.executedPlan} + |${df.queryExecution.executedPlan} |== Results == |${sideBySide( s"== Correct Answer - ${expectedAnswer.size} ==" +: - prepareAnswer(expectedAnswer).map(_.toString), + prepareAnswer(expectedAnswer).map(_.toString()), s"== Spark Answer - ${sparkAnswer.size} ==" +: - prepareAnswer(sparkAnswer).map(_.toString)).mkString("\n")} + prepareAnswer(sparkAnswer).map(_.toString())).mkString("\n")} """.stripMargin return Some(errorMessage) } @@ -147,8 +147,8 @@ object QueryTest { return None } - def checkAnswer(rdd: DataFrame, expectedAnswer: java.util.List[Row]): String = { - checkAnswer(rdd, expectedAnswer.toSeq) match { + def checkAnswer(df: DataFrame, expectedAnswer: java.util.List[Row]): String = { + checkAnswer(df, expectedAnswer.toSeq) match { case Some(errorMessage) => errorMessage case None => null } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 97684f75e79fe..097bf0dd23c89 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1034,10 +1034,10 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { test("Supporting relational operator '<=>' in Spark SQL") { val nullCheckData1 = TestData(1,"1") :: TestData(2,null) :: Nil val rdd1 = sparkContext.parallelize((0 to 1).map(i => nullCheckData1(i))) - rdd1.toDF.registerTempTable("nulldata1") + rdd1.toDF().registerTempTable("nulldata1") val nullCheckData2 = TestData(1,"1") :: TestData(2,null) :: Nil val rdd2 = sparkContext.parallelize((0 to 1).map(i => nullCheckData2(i))) - rdd2.toDF.registerTempTable("nulldata2") + rdd2.toDF().registerTempTable("nulldata2") checkAnswer(sql("SELECT nulldata1.key FROM nulldata1 join " + "nulldata2 on nulldata1.value <=> nulldata2.value"), (1 to 2).map(i => Row(i))) @@ -1046,7 +1046,7 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll { test("Multi-column COUNT(DISTINCT ...)") { val data = TestData(1,"val_1") :: TestData(2,"val_2") :: Nil val rdd = sparkContext.parallelize((0 to 1).map(i => data(i))) - rdd.toDF.registerTempTable("distinctData") + 
rdd.toDF().registerTempTable("distinctData") checkAnswer(sql("SELECT COUNT(DISTINCT key,value) FROM distinctData"), Row(2)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala index 9a48f8d0634cb..23df6e7eac043 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala @@ -82,7 +82,7 @@ class ScalaReflectionRelationSuite extends FunSuite { val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, new java.math.BigDecimal(1), new Date(12345), new Timestamp(12345), Seq(1,2,3)) val rdd = sparkContext.parallelize(data :: Nil) - rdd.toDF.registerTempTable("reflectData") + rdd.toDF().registerTempTable("reflectData") assert(sql("SELECT * FROM reflectData").collect().head === Row("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, @@ -93,7 +93,7 @@ class ScalaReflectionRelationSuite extends FunSuite { test("query case class RDD with nulls") { val data = NullReflectData(null, null, null, null, null, null, null) val rdd = sparkContext.parallelize(data :: Nil) - rdd.toDF.registerTempTable("reflectNullData") + rdd.toDF().registerTempTable("reflectNullData") assert(sql("SELECT * FROM reflectNullData").collect().head === Row.fromSeq(Seq.fill(7)(null))) } @@ -101,7 +101,7 @@ class ScalaReflectionRelationSuite extends FunSuite { test("query case class RDD with Nones") { val data = OptionalReflectData(None, None, None, None, None, None, None) val rdd = sparkContext.parallelize(data :: Nil) - rdd.toDF.registerTempTable("reflectOptionalData") + rdd.toDF().registerTempTable("reflectOptionalData") assert(sql("SELECT * FROM reflectOptionalData").collect().head === Row.fromSeq(Seq.fill(7)(null))) } @@ -109,7 +109,7 @@ class ScalaReflectionRelationSuite extends FunSuite { // Equality is broken for Arrays, so we test that separately. 
test("query binary data") { val rdd = sparkContext.parallelize(ReflectBinary(Array[Byte](1)) :: Nil) - rdd.toDF.registerTempTable("reflectBinary") + rdd.toDF().registerTempTable("reflectBinary") val result = sql("SELECT data FROM reflectBinary").collect().head(0).asInstanceOf[Array[Byte]] assert(result.toSeq === Seq[Byte](1)) @@ -128,7 +128,7 @@ class ScalaReflectionRelationSuite extends FunSuite { Map(10 -> Some(100L), 20 -> Some(200L), 30 -> None), Nested(None, "abc"))) val rdd = sparkContext.parallelize(data :: Nil) - rdd.toDF.registerTempTable("reflectComplexData") + rdd.toDF().registerTempTable("reflectComplexData") assert(sql("SELECT * FROM reflectComplexData").collect().head === new GenericRow(Array[Any]( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala index c511eb1469167..637f59b2e68ca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala @@ -29,11 +29,11 @@ case class TestData(key: Int, value: String) object TestData { val testData = TestSQLContext.sparkContext.parallelize( - (1 to 100).map(i => TestData(i, i.toString))).toDF + (1 to 100).map(i => TestData(i, i.toString))).toDF() testData.registerTempTable("testData") val negativeData = TestSQLContext.sparkContext.parallelize( - (1 to 100).map(i => TestData(-i, (-i).toString))).toDF + (1 to 100).map(i => TestData(-i, (-i).toString))).toDF() negativeData.registerTempTable("negativeData") case class LargeAndSmallInts(a: Int, b: Int) @@ -44,7 +44,7 @@ object TestData { LargeAndSmallInts(2147483645, 1) :: LargeAndSmallInts(2, 2) :: LargeAndSmallInts(2147483646, 1) :: - LargeAndSmallInts(3, 2) :: Nil).toDF + LargeAndSmallInts(3, 2) :: Nil).toDF() largeAndSmallInts.registerTempTable("largeAndSmallInts") case class TestData2(a: Int, b: Int) @@ -55,7 +55,7 @@ object TestData { TestData2(2, 1) :: TestData2(2, 2) :: TestData2(3, 1) :: - TestData2(3, 2) :: Nil, 2).toDF + TestData2(3, 2) :: Nil, 2).toDF() testData2.registerTempTable("testData2") case class DecimalData(a: BigDecimal, b: BigDecimal) @@ -67,7 +67,7 @@ object TestData { DecimalData(2, 1) :: DecimalData(2, 2) :: DecimalData(3, 1) :: - DecimalData(3, 2) :: Nil).toDF + DecimalData(3, 2) :: Nil).toDF() decimalData.registerTempTable("decimalData") case class BinaryData(a: Array[Byte], b: Int) @@ -77,14 +77,14 @@ object TestData { BinaryData("22".getBytes(), 5) :: BinaryData("122".getBytes(), 3) :: BinaryData("121".getBytes(), 2) :: - BinaryData("123".getBytes(), 4) :: Nil).toDF + BinaryData("123".getBytes(), 4) :: Nil).toDF() binaryData.registerTempTable("binaryData") case class TestData3(a: Int, b: Option[Int]) val testData3 = TestSQLContext.sparkContext.parallelize( TestData3(1, None) :: - TestData3(2, Some(2)) :: Nil).toDF + TestData3(2, Some(2)) :: Nil).toDF() testData3.registerTempTable("testData3") val emptyTableData = logical.LocalRelation($"a".int, $"b".int) @@ -97,7 +97,7 @@ object TestData { UpperCaseData(3, "C") :: UpperCaseData(4, "D") :: UpperCaseData(5, "E") :: - UpperCaseData(6, "F") :: Nil).toDF + UpperCaseData(6, "F") :: Nil).toDF() upperCaseData.registerTempTable("upperCaseData") case class LowerCaseData(n: Int, l: String) @@ -106,7 +106,7 @@ object TestData { LowerCaseData(1, "a") :: LowerCaseData(2, "b") :: LowerCaseData(3, "c") :: - LowerCaseData(4, "d") :: Nil).toDF + LowerCaseData(4, "d") :: Nil).toDF() lowerCaseData.registerTempTable("lowerCaseData") case class ArrayData(data: Seq[Int], 
nestedData: Seq[Seq[Int]]) @@ -114,7 +114,7 @@ object TestData { TestSQLContext.sparkContext.parallelize( ArrayData(Seq(1,2,3), Seq(Seq(1,2,3))) :: ArrayData(Seq(2,3,4), Seq(Seq(2,3,4))) :: Nil) - arrayData.toDF.registerTempTable("arrayData") + arrayData.toDF().registerTempTable("arrayData") case class MapData(data: scala.collection.Map[Int, String]) val mapData = @@ -124,18 +124,18 @@ object TestData { MapData(Map(1 -> "a3", 2 -> "b3", 3 -> "c3")) :: MapData(Map(1 -> "a4", 2 -> "b4")) :: MapData(Map(1 -> "a5")) :: Nil) - mapData.toDF.registerTempTable("mapData") + mapData.toDF().registerTempTable("mapData") case class StringData(s: String) val repeatedData = TestSQLContext.sparkContext.parallelize(List.fill(2)(StringData("test"))) - repeatedData.toDF.registerTempTable("repeatedData") + repeatedData.toDF().registerTempTable("repeatedData") val nullableRepeatedData = TestSQLContext.sparkContext.parallelize( List.fill(2)(StringData(null)) ++ List.fill(2)(StringData("test"))) - nullableRepeatedData.toDF.registerTempTable("nullableRepeatedData") + nullableRepeatedData.toDF().registerTempTable("nullableRepeatedData") case class NullInts(a: Integer) val nullInts = @@ -144,7 +144,7 @@ object TestData { NullInts(2) :: NullInts(3) :: NullInts(null) :: Nil - ).toDF + ).toDF() nullInts.registerTempTable("nullInts") val allNulls = @@ -152,7 +152,7 @@ object TestData { NullInts(null) :: NullInts(null) :: NullInts(null) :: - NullInts(null) :: Nil).toDF + NullInts(null) :: Nil).toDF() allNulls.registerTempTable("allNulls") case class NullStrings(n: Int, s: String) @@ -160,11 +160,15 @@ object TestData { TestSQLContext.sparkContext.parallelize( NullStrings(1, "abc") :: NullStrings(2, "ABC") :: - NullStrings(3, null) :: Nil).toDF + NullStrings(3, null) :: Nil).toDF() nullStrings.registerTempTable("nullStrings") case class TableName(tableName: String) - TestSQLContext.sparkContext.parallelize(TableName("test") :: Nil).toDF.registerTempTable("tableName") + TestSQLContext + .sparkContext + .parallelize(TableName("test") :: Nil) + .toDF() + .registerTempTable("tableName") val unparsedStrings = TestSQLContext.sparkContext.parallelize( @@ -177,22 +181,22 @@ object TestData { val timestamps = TestSQLContext.sparkContext.parallelize((1 to 3).map { i => TimestampField(new Timestamp(i)) }) - timestamps.toDF.registerTempTable("timestamps") + timestamps.toDF().registerTempTable("timestamps") case class IntField(i: Int) // An RDD with 4 elements and 8 partitions val withEmptyParts = TestSQLContext.sparkContext.parallelize((1 to 4).map(IntField), 8) - withEmptyParts.toDF.registerTempTable("withEmptyParts") + withEmptyParts.toDF().registerTempTable("withEmptyParts") case class Person(id: Int, name: String, age: Int) case class Salary(personId: Int, salary: Double) val person = TestSQLContext.sparkContext.parallelize( Person(0, "mike", 30) :: - Person(1, "jim", 20) :: Nil).toDF + Person(1, "jim", 20) :: Nil).toDF() person.registerTempTable("person") val salary = TestSQLContext.sparkContext.parallelize( Salary(0, 2000.0) :: - Salary(1, 1000.0) :: Nil).toDF + Salary(1, 1000.0) :: Nil).toDF() salary.registerTempTable("salary") case class ComplexData(m: Map[Int, String], s: TestData, a: Seq[Int], b: Boolean) @@ -200,6 +204,6 @@ object TestData { TestSQLContext.sparkContext.parallelize( ComplexData(Map(1 -> "1"), TestData(1, "1"), Seq(1), true) :: ComplexData(Map(2 -> "2"), TestData(2, "2"), Seq(2), false) - :: Nil).toDF + :: Nil).toDF() complexData.registerTempTable("complexData") } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index ad2fbc3f04a9c..ee5c7620d1a22 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -17,13 +17,13 @@ package org.apache.spark.sql.jdbc -import java.math.BigDecimal +import java.sql.DriverManager + +import org.scalatest.{BeforeAndAfter, FunSuite} + import org.apache.spark.sql.Row -import org.apache.spark.sql.types._ import org.apache.spark.sql.test._ -import org.scalatest.{FunSuite, BeforeAndAfter} -import java.sql.DriverManager -import TestSQLContext._ +import org.apache.spark.sql.types._ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { val url = "jdbc:h2:mem:testdb2" @@ -54,53 +54,53 @@ class JDBCWriteSuite extends FunSuite with BeforeAndAfter { StructField("seq", IntegerType) :: Nil) test("Basic CREATE") { - val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) + val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) - srdd.createJDBCTable(url, "TEST.BASICCREATETEST", false) + df.createJDBCTable(url, "TEST.BASICCREATETEST", false) assert(2 == TestSQLContext.jdbc(url, "TEST.BASICCREATETEST").count) assert(2 == TestSQLContext.jdbc(url, "TEST.BASICCREATETEST").collect()(0).length) } test("CREATE with overwrite") { - val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x3), schema3) - val srdd2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2) + val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x3), schema3) + val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2) - srdd.createJDBCTable(url, "TEST.DROPTEST", false) + df.createJDBCTable(url, "TEST.DROPTEST", false) assert(2 == TestSQLContext.jdbc(url, "TEST.DROPTEST").count) assert(3 == TestSQLContext.jdbc(url, "TEST.DROPTEST").collect()(0).length) - srdd2.createJDBCTable(url, "TEST.DROPTEST", true) + df2.createJDBCTable(url, "TEST.DROPTEST", true) assert(1 == TestSQLContext.jdbc(url, "TEST.DROPTEST").count) assert(2 == TestSQLContext.jdbc(url, "TEST.DROPTEST").collect()(0).length) } test("CREATE then INSERT to append") { - val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) - val srdd2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2) + val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) + val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2) - srdd.createJDBCTable(url, "TEST.APPENDTEST", false) - srdd2.insertIntoJDBC(url, "TEST.APPENDTEST", false) + df.createJDBCTable(url, "TEST.APPENDTEST", false) + df2.insertIntoJDBC(url, "TEST.APPENDTEST", false) assert(3 == TestSQLContext.jdbc(url, "TEST.APPENDTEST").count) assert(2 == TestSQLContext.jdbc(url, "TEST.APPENDTEST").collect()(0).length) } test("CREATE then INSERT to truncate") { - val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) - val srdd2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2) + val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) + val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr1x2), schema2) - srdd.createJDBCTable(url, "TEST.TRUNCATETEST", false) - srdd2.insertIntoJDBC(url, "TEST.TRUNCATETEST", true) + df.createJDBCTable(url, "TEST.TRUNCATETEST", false) + df2.insertIntoJDBC(url, "TEST.TRUNCATETEST", true) assert(1 == TestSQLContext.jdbc(url, "TEST.TRUNCATETEST").count) 
assert(2 == TestSQLContext.jdbc(url, "TEST.TRUNCATETEST").collect()(0).length) } test("Incompatible INSERT to append") { - val srdd = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) - val srdd2 = TestSQLContext.createDataFrame(sc.parallelize(arr2x3), schema3) + val df = TestSQLContext.createDataFrame(sc.parallelize(arr2x2), schema2) + val df2 = TestSQLContext.createDataFrame(sc.parallelize(arr2x3), schema3) - srdd.createJDBCTable(url, "TEST.INCOMPATIBLETEST", false) + df.createJDBCTable(url, "TEST.INCOMPATIBLETEST", false) intercept[org.apache.spark.SparkException] { - srdd2.insertIntoJDBC(url, "TEST.INCOMPATIBLETEST", true) + df2.insertIntoJDBC(url, "TEST.INCOMPATIBLETEST", true) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegration.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegration.scala index 4f38110c80cc6..5b8a76f461faf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegration.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegration.scala @@ -18,18 +18,13 @@ package org.apache.spark.sql.jdbc import java.math.BigDecimal -import java.sql.{Date, DriverManager, Timestamp} -import com.spotify.docker.client.{DefaultDockerClient, DockerClient} +import java.sql.{Date, Timestamp} + +import com.spotify.docker.client.DockerClient import com.spotify.docker.client.messages.ContainerConfig -import org.scalatest.{FunSuite, BeforeAndAfterAll, Ignore} +import org.scalatest.{BeforeAndAfterAll, FunSuite, Ignore} -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.SparkContext._ -import org.apache.spark.sql._ import org.apache.spark.sql.test._ -import TestSQLContext._ - -import org.apache.spark.sql.jdbc._ class MySQLDatabase { val docker: DockerClient = DockerClientFactory.get() @@ -37,9 +32,9 @@ class MySQLDatabase { println("Pulling mysql") docker.pull("mysql") println("Configuring container") - val config = (ContainerConfig.builder().image("mysql") - .env("MYSQL_ROOT_PASSWORD=rootpass") - .build()) + val config = ContainerConfig.builder().image("mysql") + .env("MYSQL_ROOT_PASSWORD=rootpass") + .build() println("Creating container") val id = docker.createContainer(config).id println("Starting container " + id) @@ -57,11 +52,10 @@ class MySQLDatabase { println("Closing docker client") DockerClientFactory.close(docker) } catch { - case e: Exception => { + case e: Exception => println(e) println("You may need to clean this up manually.") throw e - } } } } @@ -86,10 +80,9 @@ class MySQLDatabase { println("Database is up.") return; } catch { - case e: java.sql.SQLException => { + case e: java.sql.SQLException => lastException = e java.lang.Thread.sleep(250) - } } } } @@ -143,8 +136,8 @@ class MySQLDatabase { } test("Basic test") { - val rdd = TestSQLContext.jdbc(url(ip, "foo"), "tbl") - val rows = rdd.collect + val df = TestSQLContext.jdbc(url(ip, "foo"), "tbl") + val rows = df.collect() assert(rows.length == 2) val types = rows(0).toSeq.map(x => x.getClass.toString) assert(types.length == 2) @@ -153,8 +146,8 @@ class MySQLDatabase { } test("Numeric types") { - val rdd = TestSQLContext.jdbc(url(ip, "foo"), "numbers") - val rows = rdd.collect + val df = TestSQLContext.jdbc(url(ip, "foo"), "numbers") + val rows = df.collect() assert(rows.length == 1) val types = rows(0).toSeq.map(x => x.getClass.toString) assert(types.length == 9) @@ -181,8 +174,8 @@ class MySQLDatabase { } test("Date types") { - val rdd = TestSQLContext.jdbc(url(ip, "foo"), "dates") - val rows = rdd.collect 
+ val df = TestSQLContext.jdbc(url(ip, "foo"), "dates") + val rows = df.collect() assert(rows.length == 1) val types = rows(0).toSeq.map(x => x.getClass.toString) assert(types.length == 5) @@ -199,8 +192,8 @@ class MySQLDatabase { } test("String types") { - val rdd = TestSQLContext.jdbc(url(ip, "foo"), "strings") - val rows = rdd.collect + val df = TestSQLContext.jdbc(url(ip, "foo"), "strings") + val rows = df.collect() assert(rows.length == 1) val types = rows(0).toSeq.map(x => x.getClass.toString) assert(types.length == 9) @@ -225,11 +218,11 @@ class MySQLDatabase { } test("Basic write test") { - val rdd1 = TestSQLContext.jdbc(url(ip, "foo"), "numbers") - val rdd2 = TestSQLContext.jdbc(url(ip, "foo"), "dates") - val rdd3 = TestSQLContext.jdbc(url(ip, "foo"), "strings") - rdd1.createJDBCTable(url(ip, "foo"), "numberscopy", false) - rdd2.createJDBCTable(url(ip, "foo"), "datescopy", false) - rdd3.createJDBCTable(url(ip, "foo"), "stringscopy", false) + val df1 = TestSQLContext.jdbc(url(ip, "foo"), "numbers") + val df2 = TestSQLContext.jdbc(url(ip, "foo"), "dates") + val df3 = TestSQLContext.jdbc(url(ip, "foo"), "strings") + df1.createJDBCTable(url(ip, "foo"), "numberscopy", false) + df2.createJDBCTable(url(ip, "foo"), "datescopy", false) + df3.createJDBCTable(url(ip, "foo"), "stringscopy", false) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegration.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegration.scala index 7b47feeb7887e..e17be99ac31d5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegration.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegration.scala @@ -17,13 +17,13 @@ package org.apache.spark.sql.jdbc -import java.math.BigDecimal -import org.apache.spark.sql.test._ -import org.scalatest.{FunSuite, BeforeAndAfterAll, Ignore} import java.sql.DriverManager -import TestSQLContext._ -import com.spotify.docker.client.{DefaultDockerClient, DockerClient} + +import com.spotify.docker.client.DockerClient import com.spotify.docker.client.messages.ContainerConfig +import org.scalatest.{BeforeAndAfterAll, FunSuite, Ignore} + +import org.apache.spark.sql.test._ class PostgresDatabase { val docker: DockerClient = DockerClientFactory.get() @@ -31,9 +31,9 @@ class PostgresDatabase { println("Pulling postgres") docker.pull("postgres") println("Configuring container") - val config = (ContainerConfig.builder().image("postgres") - .env("POSTGRES_PASSWORD=rootpass") - .build()) + val config = ContainerConfig.builder().image("postgres") + .env("POSTGRES_PASSWORD=rootpass") + .build() println("Creating container") val id = docker.createContainer(config).id println("Starting container " + id) @@ -51,11 +51,10 @@ class PostgresDatabase { println("Closing docker client") DockerClientFactory.close(docker) } catch { - case e: Exception => { + case e: Exception => println(e) println("You may need to clean this up manually.") throw e - } } } } @@ -79,10 +78,9 @@ class PostgresDatabase { println("Database is up.") return; } catch { - case e: java.sql.SQLException => { + case e: java.sql.SQLException => lastException = e java.lang.Thread.sleep(250) - } } } } @@ -113,8 +111,8 @@ class PostgresDatabase { } test("Type mapping for various types") { - val rdd = TestSQLContext.jdbc(url(db.ip), "public.bar") - val rows = rdd.collect + val df = TestSQLContext.jdbc(url(db.ip), "public.bar") + val rows = df.collect() assert(rows.length == 1) val types = rows(0).toSeq.map(x => x.getClass.toString) assert(types.length 
== 10) @@ -142,8 +140,8 @@ class PostgresDatabase { } test("Basic write test") { - val rdd = TestSQLContext.jdbc(url(db.ip), "public.bar") - rdd.createJDBCTable(url(db.ip), "public.barcopy", false) + val df = TestSQLContext.jdbc(url(db.ip), "public.bar") + df.createJDBCTable(url(db.ip), "public.barcopy", false) // Test only that it doesn't bomb out. } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala index eb2d5f25290b1..4d32e84fc1115 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala @@ -45,7 +45,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest { val sqlContext = TestSQLContext private def checkFilterPredicate( - rdd: DataFrame, + df: DataFrame, predicate: Predicate, filterClass: Class[_ <: FilterPredicate], checker: (DataFrame, Seq[Row]) => Unit, @@ -53,7 +53,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest { val output = predicate.collect { case a: Attribute => a }.distinct withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED -> "true") { - val query = rdd + val query = df .select(output.map(e => Column(e)): _*) .where(Column(predicate)) @@ -85,36 +85,36 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest { private def checkFilterPredicate (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Seq[Row]) - (implicit rdd: DataFrame): Unit = { - checkFilterPredicate(rdd, predicate, filterClass, checkAnswer(_, _: Seq[Row]), expected) + (implicit df: DataFrame): Unit = { + checkFilterPredicate(df, predicate, filterClass, checkAnswer(_, _: Seq[Row]), expected) } private def checkFilterPredicate[T] (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: T) - (implicit rdd: DataFrame): Unit = { - checkFilterPredicate(predicate, filterClass, Seq(Row(expected)))(rdd) + (implicit df: DataFrame): Unit = { + checkFilterPredicate(predicate, filterClass, Seq(Row(expected)))(df) } private def checkBinaryFilterPredicate (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Seq[Row]) - (implicit rdd: DataFrame): Unit = { - def checkBinaryAnswer(rdd: DataFrame, expected: Seq[Row]) = { + (implicit df: DataFrame): Unit = { + def checkBinaryAnswer(df: DataFrame, expected: Seq[Row]) = { assertResult(expected.map(_.getAs[Array[Byte]](0).mkString(",")).toSeq.sorted) { - rdd.map(_.getAs[Array[Byte]](0).mkString(",")).collect().toSeq.sorted + df.map(_.getAs[Array[Byte]](0).mkString(",")).collect().toSeq.sorted } } - checkFilterPredicate(rdd, predicate, filterClass, checkBinaryAnswer _, expected) + checkFilterPredicate(df, predicate, filterClass, checkBinaryAnswer _, expected) } private def checkBinaryFilterPredicate (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Array[Byte]) - (implicit rdd: DataFrame): Unit = { - checkBinaryFilterPredicate(predicate, filterClass, Seq(Row(expected)))(rdd) + (implicit df: DataFrame): Unit = { + checkBinaryFilterPredicate(predicate, filterClass, Seq(Row(expected)))(df) } test("filter pushdown - boolean") { - withParquetRDD((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit rdd => + withParquetDataFrame((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit df => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, 
classOf[NotEq[_]], Seq(Row(true), Row(false))) @@ -124,7 +124,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest { } test("filter pushdown - short") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit rdd => + withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit df => checkFilterPredicate(Cast('_1, IntegerType) === 1, classOf[Eq[_]], 1) checkFilterPredicate( Cast('_1, IntegerType) !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) @@ -151,7 +151,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest { } test("filter pushdown - integer") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i)))) { implicit rdd => + withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) @@ -176,7 +176,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest { } test("filter pushdown - long") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit rdd => + withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit df => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) @@ -201,7 +201,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest { } test("filter pushdown - float") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit rdd => + withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit df => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) @@ -226,7 +226,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest { } test("filter pushdown - double") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit rdd => + withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit df => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) @@ -251,7 +251,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest { } test("filter pushdown - string") { - withParquetRDD((1 to 4).map(i => Tuple1(i.toString))) { implicit rdd => + withParquetDataFrame((1 to 4).map(i => Tuple1(i.toString))) { implicit df => checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate( '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.toString))) @@ -282,7 +282,7 @@ class ParquetFilterSuiteBase extends QueryTest with ParquetTest { def b: Array[Byte] = int.toString.getBytes("UTF-8") } - withParquetRDD((1 to 4).map(i => Tuple1(i.b))) { implicit rdd => + withParquetDataFrame((1 to 4).map(i => Tuple1(i.b))) { implicit df => checkBinaryFilterPredicate('_1 === 1.b, classOf[Eq[_]], 1.b) checkBinaryFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index 208f35761b807..36f3406a7825f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -73,7 +73,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest { * Writes `data` to a 
Parquet file, reads it back and check file contents. */ protected def checkParquetFile[T <: Product : ClassTag: TypeTag](data: Seq[T]): Unit = { - withParquetRDD(data)(r => checkAnswer(r, data.map(Row.fromTuple))) + withParquetDataFrame(data)(r => checkAnswer(r, data.map(Row.fromTuple))) } test("basic data types (without binary)") { @@ -85,9 +85,9 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest { test("raw binary") { val data = (1 to 4).map(i => Tuple1(Array.fill(3)(i.toByte))) - withParquetRDD(data) { rdd => + withParquetDataFrame(data) { df => assertResult(data.map(_._1.mkString(",")).sorted) { - rdd.collect().map(_.getAs[Array[Byte]](0).mkString(",")).sorted + df.collect().map(_.getAs[Array[Byte]](0).mkString(",")).sorted } } } @@ -106,7 +106,7 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest { sparkContext .parallelize(0 to 1000) .map(i => Tuple1(i / 100.0)) - .toDF + .toDF() // Parquet doesn't allow column names with spaces, have to add an alias here .select($"_1" cast decimal as "dec") @@ -147,9 +147,9 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest { test("struct") { val data = (1 to 4).map(i => Tuple1((i, s"val_$i"))) - withParquetRDD(data) { rdd => + withParquetDataFrame(data) { df => // Structs are converted to `Row`s - checkAnswer(rdd, data.map { case Tuple1(struct) => + checkAnswer(df, data.map { case Tuple1(struct) => Row(Row(struct.productIterator.toSeq: _*)) }) } @@ -157,9 +157,9 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest { test("nested struct with array of array as field") { val data = (1 to 4).map(i => Tuple1((i, Seq(Seq(s"val_$i"))))) - withParquetRDD(data) { rdd => + withParquetDataFrame(data) { df => // Structs are converted to `Row`s - checkAnswer(rdd, data.map { case Tuple1(struct) => + checkAnswer(df, data.map { case Tuple1(struct) => Row(Row(struct.productIterator.toSeq: _*)) }) } @@ -167,8 +167,8 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest { test("nested map with struct as value type") { val data = (1 to 4).map(i => Tuple1(Map(i -> (i, s"val_$i")))) - withParquetRDD(data) { rdd => - checkAnswer(rdd, data.map { case Tuple1(m) => + withParquetDataFrame(data) { df => + checkAnswer(df, data.map { case Tuple1(m) => Row(m.mapValues(struct => Row(struct.productIterator.toSeq: _*))) }) } @@ -182,8 +182,8 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest { null.asInstanceOf[java.lang.Float], null.asInstanceOf[java.lang.Double]) - withParquetRDD(allNulls :: Nil) { rdd => - val rows = rdd.collect() + withParquetDataFrame(allNulls :: Nil) { df => + val rows = df.collect() assert(rows.size === 1) assert(rows.head === Row(Seq.fill(5)(null): _*)) } @@ -195,8 +195,8 @@ class ParquetIOSuiteBase extends QueryTest with ParquetTest { None.asInstanceOf[Option[Long]], None.asInstanceOf[Option[String]]) - withParquetRDD(allNones :: Nil) { rdd => - val rows = rdd.collect() + withParquetDataFrame(allNones :: Nil) { df => + val rows = df.collect() assert(rows.size === 1) assert(rows.head === Row(Seq.fill(3)(null): _*)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index 8b4d05ec547c6..b98ba09ccfc2d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -68,7 +68,7 @@ class ParquetQuerySuiteBase extends QueryTest with ParquetTest { val selfJoin = 
sql("SELECT * FROM t x JOIN t y WHERE x._1 = y._1") val queryOutput = selfJoin.queryExecution.analyzed.output - assertResult(4, "Field count mismatche")(queryOutput.size) + assertResult(4, "Field count mismatches")(queryOutput.size) assertResult(2, "Duplicated expression ID in query plan:\n $selfJoin") { queryOutput.filter(_.name == "_1").map(_.exprId).size } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 2e6c2d5f9ab55..ad880e2bc3679 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -36,8 +36,8 @@ class ParquetSchemaSuite extends FunSuite with ParquetTest { private def testSchema[T <: Product: ClassTag: TypeTag]( testName: String, messageType: String, isThriftDerived: Boolean = false): Unit = { test(testName) { - val actual = ParquetTypesConverter.convertFromAttributes(ScalaReflection.attributesFor[T], - isThriftDerived) + val actual = ParquetTypesConverter.convertFromAttributes( + ScalaReflection.attributesFor[T], isThriftDerived) val expected = MessageTypeParser.parseMessageType(messageType) actual.checkContains(expected) expected.checkContains(actual) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 9fcb04ca23590..d4b175fa443a4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -37,7 +37,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { import org.apache.spark.sql.hive.test.TestHive.implicits._ val testData = TestHive.sparkContext.parallelize( - (1 to 100).map(i => TestData(i, i.toString))).toDF + (1 to 100).map(i => TestData(i, i.toString))).toDF() before { // Since every we are doing tests for DDL statements, @@ -65,7 +65,7 @@ class InsertIntoHiveTableSuite extends QueryTest with BeforeAndAfter { // Make sure the table has been updated. checkAnswer( sql("SELECT * FROM createAndInsertTest"), - testData.toDF.collect().toSeq ++ testData.toDF.collect().toSeq + testData.toDF().collect().toSeq ++ testData.toDF().collect().toSeq ) // Now overwrite. 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index e5156ae821bf4..0bd82773f3a55 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -154,7 +154,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { test("check change without refresh") { val tempDir = File.createTempFile("sparksql", "json") tempDir.delete() - sparkContext.parallelize(("a", "b") :: Nil).toDF + sparkContext.parallelize(("a", "b") :: Nil).toDF() .toJSON.saveAsTextFile(tempDir.getCanonicalPath) sql( @@ -171,7 +171,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { Row("a", "b")) FileUtils.deleteDirectory(tempDir) - sparkContext.parallelize(("a1", "b1", "c1") :: Nil).toDF + sparkContext.parallelize(("a1", "b1", "c1") :: Nil).toDF() .toJSON.saveAsTextFile(tempDir.getCanonicalPath) // Schema is cached so the new column does not show. The updated values in existing columns @@ -192,7 +192,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { test("drop, change, recreate") { val tempDir = File.createTempFile("sparksql", "json") tempDir.delete() - sparkContext.parallelize(("a", "b") :: Nil).toDF + sparkContext.parallelize(("a", "b") :: Nil).toDF() .toJSON.saveAsTextFile(tempDir.getCanonicalPath) sql( @@ -209,7 +209,7 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { Row("a", "b")) FileUtils.deleteDirectory(tempDir) - sparkContext.parallelize(("a", "b", "c") :: Nil).toDF + sparkContext.parallelize(("a", "b", "c") :: Nil).toDF() .toJSON.saveAsTextFile(tempDir.getCanonicalPath) sql("DROP TABLE jsonTable") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 6f07fd5a879c0..1e05a024b8807 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -127,11 +127,11 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { } test("estimates the size of a test MetastoreRelation") { - val rdd = sql("""SELECT * FROM src""") - val sizes = rdd.queryExecution.analyzed.collect { case mr: MetastoreRelation => + val df = sql("""SELECT * FROM src""") + val sizes = df.queryExecution.analyzed.collect { case mr: MetastoreRelation => mr.statistics.sizeInBytes } - assert(sizes.size === 1, s"Size wrong for:\n ${rdd.queryExecution}") + assert(sizes.size === 1, s"Size wrong for:\n ${df.queryExecution}") assert(sizes(0).equals(BigInt(5812)), s"expected exact size 5812 for test table 'src', got: ${sizes(0)}") } @@ -145,10 +145,10 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { ct: ClassTag[_]) = { before() - var rdd = sql(query) + var df = sql(query) // Assert src has a size smaller than the threshold. 
- val sizes = rdd.queryExecution.analyzed.collect { + val sizes = df.queryExecution.analyzed.collect { case r if ct.runtimeClass.isAssignableFrom(r.getClass) => r.statistics.sizeInBytes } assert(sizes.size === 2 && sizes(0) <= conf.autoBroadcastJoinThreshold @@ -157,21 +157,21 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { // Using `sparkPlan` because for relevant patterns in HashJoin to be // matched, other strategies need to be applied. - var bhj = rdd.queryExecution.sparkPlan.collect { case j: BroadcastHashJoin => j } + var bhj = df.queryExecution.sparkPlan.collect { case j: BroadcastHashJoin => j } assert(bhj.size === 1, - s"actual query plans do not contain broadcast join: ${rdd.queryExecution}") + s"actual query plans do not contain broadcast join: ${df.queryExecution}") - checkAnswer(rdd, expectedAnswer) // check correctness of output + checkAnswer(df, expectedAnswer) // check correctness of output TestHive.conf.settings.synchronized { val tmp = conf.autoBroadcastJoinThreshold sql(s"""SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1""") - rdd = sql(query) - bhj = rdd.queryExecution.sparkPlan.collect { case j: BroadcastHashJoin => j } + df = sql(query) + bhj = df.queryExecution.sparkPlan.collect { case j: BroadcastHashJoin => j } assert(bhj.isEmpty, "BroadcastHashJoin still planned even though it is switched off") - val shj = rdd.queryExecution.sparkPlan.collect { case j: ShuffledHashJoin => j } + val shj = df.queryExecution.sparkPlan.collect { case j: ShuffledHashJoin => j } assert(shj.size === 1, "ShuffledHashJoin should be planned when BroadcastHashJoin is turned off") @@ -199,10 +199,10 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { |left semi JOIN src b ON a.key=86 and a.key = b.key""".stripMargin val answer = Row(86, "val_86") - var rdd = sql(leftSemiJoinQuery) + var df = sql(leftSemiJoinQuery) // Assert src has a size smaller than the threshold. - val sizes = rdd.queryExecution.analyzed.collect { + val sizes = df.queryExecution.analyzed.collect { case r if implicitly[ClassTag[MetastoreRelation]].runtimeClass .isAssignableFrom(r.getClass) => r.statistics.sizeInBytes @@ -213,25 +213,25 @@ class StatisticsSuite extends QueryTest with BeforeAndAfterAll { // Using `sparkPlan` because for relevant patterns in HashJoin to be // matched, other strategies need to be applied. 
- var bhj = rdd.queryExecution.sparkPlan.collect { + var bhj = df.queryExecution.sparkPlan.collect { case j: BroadcastLeftSemiJoinHash => j } assert(bhj.size === 1, - s"actual query plans do not contain broadcast join: ${rdd.queryExecution}") + s"actual query plans do not contain broadcast join: ${df.queryExecution}") - checkAnswer(rdd, answer) // check correctness of output + checkAnswer(df, answer) // check correctness of output TestHive.conf.settings.synchronized { val tmp = conf.autoBroadcastJoinThreshold sql(s"SET ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD}=-1") - rdd = sql(leftSemiJoinQuery) - bhj = rdd.queryExecution.sparkPlan.collect { + df = sql(leftSemiJoinQuery) + bhj = df.queryExecution.sparkPlan.collect { case j: BroadcastLeftSemiJoinHash => j } assert(bhj.isEmpty, "BroadcastHashJoin still planned even though it is switched off") - val shj = rdd.queryExecution.sparkPlan.collect { + val shj = df.queryExecution.sparkPlan.collect { case j: LeftSemiJoinHash => j } assert(shj.size === 1, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 955f3f51cfe9f..bb0a67dc03e1d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -429,7 +429,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { |'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' WITH SERDEPROPERTIES |('serialization.last.column.takes.rest'='true') FROM src; """.stripMargin.replaceAll("\n", " ")) - + createQueryTest("LIKE", "SELECT * FROM src WHERE value LIKE '%1%'") @@ -567,7 +567,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { TestHive.sparkContext.parallelize( TestData(1, "str1") :: TestData(2, "str2") :: Nil) - testData.toDF.registerTempTable("REGisteredTABle") + testData.toDF().registerTempTable("REGisteredTABle") assertResult(Array(Row(2, "str2"))) { sql("SELECT tablealias.A, TABLEALIAS.b FROM reGisteredTABle TableAlias " + @@ -583,8 +583,8 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { test("SPARK-1704: Explain commands as a DataFrame") { sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") - val rdd = sql("explain select key, count(value) from src group by key") - assert(isExplanation(rdd)) + val df = sql("explain select key, count(value) from src group by key") + assert(isExplanation(df)) TestHive.reset() } @@ -592,7 +592,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { test("SPARK-2180: HAVING support in GROUP BY clauses (positive)") { val fixture = List(("foo", 2), ("bar", 1), ("foo", 4), ("bar", 3)) .zipWithIndex.map {case Pair(Pair(value, attr), key) => HavingRow(key, value, attr)} - TestHive.sparkContext.parallelize(fixture).toDF.registerTempTable("having_test") + TestHive.sparkContext.parallelize(fixture).toDF().registerTempTable("having_test") val results = sql("SELECT value, max(attr) AS attr FROM having_test GROUP BY value HAVING attr > 3") .collect() @@ -740,7 +740,7 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { TestHive.sparkContext.parallelize( TestData(1, "str1") :: TestData(1, "str2") :: Nil) - testData.toDF.registerTempTable("test_describe_commands2") + testData.toDF().registerTempTable("test_describe_commands2") assertResult( Array( @@ -900,8 +900,8 @@ class HiveQuerySuite extends HiveComparisonTest with 
BeforeAndAfter { } test("SPARK-3414 regression: should store analyzed logical plan when registering a temp table") { - sparkContext.makeRDD(Seq.empty[LogEntry]).toDF.registerTempTable("rawLogs") - sparkContext.makeRDD(Seq.empty[LogFile]).toDF.registerTempTable("logFiles") + sparkContext.makeRDD(Seq.empty[LogEntry]).toDF().registerTempTable("rawLogs") + sparkContext.makeRDD(Seq.empty[LogFile]).toDF().registerTempTable("logFiles") sql( """ @@ -979,8 +979,8 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { val testVal = "test.val.0" val nonexistentKey = "nonexistent" val KV = "([^=]+)=([^=]*)".r - def collectResults(rdd: DataFrame): Set[(String, String)] = - rdd.collect().map { + def collectResults(df: DataFrame): Set[(String, String)] = + df.collect().map { case Row(key: String, value: String) => key -> value case Row(KV(key, value)) => key -> value }.toSet diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala index 6fc4cc14265ec..f4440e5b7846a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveResolutionSuite.scala @@ -77,7 +77,7 @@ class HiveResolutionSuite extends HiveComparisonTest { test("case insensitivity with scala reflection") { // Test resolution with Scala Reflection sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) - .toDF.registerTempTable("caseSensitivityTest") + .toDF().registerTempTable("caseSensitivityTest") val query = sql("SELECT a, b, A, B, n.a, n.b, n.A, n.B FROM caseSensitivityTest") assert(query.schema.fields.map(_.name) === Seq("a", "b", "A", "B", "a", "b", "A", "B"), @@ -88,14 +88,14 @@ class HiveResolutionSuite extends HiveComparisonTest { ignore("case insensitivity with scala reflection joins") { // Test resolution with Scala Reflection sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) - .toDF.registerTempTable("caseSensitivityTest") + .toDF().registerTempTable("caseSensitivityTest") sql("SELECT * FROM casesensitivitytest a JOIN casesensitivitytest b ON a.a = b.a").collect() } test("nested repeated resolution") { sparkContext.parallelize(Data(1, 2, Nested(1,2), Seq(Nested(1,2))) :: Nil) - .toDF.registerTempTable("nestedRepeatedTest") + .toDF().registerTempTable("nestedRepeatedTest") assert(sql("SELECT nestedArray[0].a FROM nestedRepeatedTest").collect().head(0) === 1) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala index 245161d2ebbca..cb405f56bf53d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala @@ -62,7 +62,7 @@ class HiveUdfSuite extends QueryTest { | getStruct(1).f5 FROM src LIMIT 1 """.stripMargin).head() === Row(1, 2, 3, 4, 5)) } - + test("SPARK-4785 When called with arguments referring column fields, PMOD throws NPE") { checkAnswer( sql("SELECT PMOD(CAST(key as INT), 10) FROM src LIMIT 1"), @@ -96,7 +96,7 @@ class HiveUdfSuite extends QueryTest { test("SPARK-2693 udaf aggregates test") { checkAnswer(sql("SELECT percentile(key, 1) FROM src LIMIT 1"), sql("SELECT max(key) FROM src").collect().toSeq) - + checkAnswer(sql("SELECT percentile(key, array(1, 1)) FROM src LIMIT 1"), sql("SELECT 
array(max(key), max(key)) FROM src").collect().toSeq) } @@ -104,14 +104,14 @@ class HiveUdfSuite extends QueryTest { test("Generic UDAF aggregates") { checkAnswer(sql("SELECT ceiling(percentile_approx(key, 0.99999)) FROM src LIMIT 1"), sql("SELECT max(key) FROM src LIMIT 1").collect().toSeq) - + checkAnswer(sql("SELECT percentile_approx(100.0, array(0.9, 0.9)) FROM src LIMIT 1"), sql("SELECT array(100, 100) FROM src LIMIT 1").collect().toSeq) } - + test("UDFIntegerToString") { val testData = TestHive.sparkContext.parallelize( - IntegerCaseClass(1) :: IntegerCaseClass(2) :: Nil).toDF + IntegerCaseClass(1) :: IntegerCaseClass(2) :: Nil).toDF() testData.registerTempTable("integerTable") sql(s"CREATE TEMPORARY FUNCTION testUDFIntegerToString AS '${classOf[UDFIntegerToString].getName}'") @@ -127,7 +127,7 @@ class HiveUdfSuite extends QueryTest { val testData = TestHive.sparkContext.parallelize( ListListIntCaseClass(Nil) :: ListListIntCaseClass(Seq((1, 2, 3))) :: - ListListIntCaseClass(Seq((4, 5, 6), (7, 8, 9))) :: Nil).toDF + ListListIntCaseClass(Seq((4, 5, 6), (7, 8, 9))) :: Nil).toDF() testData.registerTempTable("listListIntTable") sql(s"CREATE TEMPORARY FUNCTION testUDFListListInt AS '${classOf[UDFListListInt].getName}'") @@ -142,7 +142,7 @@ class HiveUdfSuite extends QueryTest { test("UDFListString") { val testData = TestHive.sparkContext.parallelize( ListStringCaseClass(Seq("a", "b", "c")) :: - ListStringCaseClass(Seq("d", "e")) :: Nil).toDF + ListStringCaseClass(Seq("d", "e")) :: Nil).toDF() testData.registerTempTable("listStringTable") sql(s"CREATE TEMPORARY FUNCTION testUDFListString AS '${classOf[UDFListString].getName}'") @@ -156,7 +156,7 @@ class HiveUdfSuite extends QueryTest { test("UDFStringString") { val testData = TestHive.sparkContext.parallelize( - StringCaseClass("world") :: StringCaseClass("goodbye") :: Nil).toDF + StringCaseClass("world") :: StringCaseClass("goodbye") :: Nil).toDF() testData.registerTempTable("stringTable") sql(s"CREATE TEMPORARY FUNCTION testStringStringUdf AS '${classOf[UDFStringString].getName}'") @@ -173,7 +173,7 @@ class HiveUdfSuite extends QueryTest { ListListIntCaseClass(Nil) :: ListListIntCaseClass(Seq((1, 2, 3))) :: ListListIntCaseClass(Seq((4, 5, 6), (7, 8, 9))) :: - Nil).toDF + Nil).toDF() testData.registerTempTable("TwoListTable") sql(s"CREATE TEMPORARY FUNCTION testUDFTwoListList AS '${classOf[UDFTwoListList].getName}'") From de0dd6de2476c22be3f41f1bf0b3ef7ffeb60001 Mon Sep 17 00:00:00 2001 From: Tor Myklebust Date: Wed, 18 Feb 2015 01:00:13 -0800 Subject: [PATCH 239/357] Avoid deprecation warnings in JDBCSuite. This pull request replaces calls to deprecated methods from `java.util.Date` with near-equivalents in `java.util.Calendar`. Author: Tor Myklebust Closes #4668 from tmyklebu/master and squashes the following commits: 66215b1 [Tor Myklebust] Use GregorianCalendar instead of Timestamp get methods. 
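The replacement pattern itself is simple: read calendar fields through `java.util.Calendar` instead of the deprecated `java.util.Date` getters. A minimal standalone sketch of the pattern (the `ts` value and the asserted fields below are illustrative, not taken from the suite):

    import java.sql.Timestamp
    import java.util.{Calendar, GregorianCalendar, Locale}

    // Deprecated style: ts.getHours / ts.getMinutes / ts.getSeconds
    val ts = Timestamp.valueOf("2002-02-20 11:22:33.543543543")

    // Calendar-based replacement: copy the instant into a GregorianCalendar
    // and query individual fields from it.
    val cal = new GregorianCalendar(Locale.ROOT)
    cal.setTime(ts) // java.sql.Timestamp extends java.util.Date
    assert(cal.get(Calendar.HOUR_OF_DAY) == 11)
    assert(cal.get(Calendar.MINUTE) == 22)
    assert(cal.get(Calendar.SECOND) == 33)
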
--- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 07db672217bc1..cd737c0b62767 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -18,9 +18,11 @@ package org.apache.spark.sql.jdbc import java.math.BigDecimal +import java.sql.DriverManager +import java.util.{Calendar, GregorianCalendar} + import org.apache.spark.sql.test._ import org.scalatest.{FunSuite, BeforeAndAfter} -import java.sql.DriverManager import TestSQLContext._ class JDBCSuite extends FunSuite with BeforeAndAfter { @@ -206,20 +208,25 @@ class JDBCSuite extends FunSuite with BeforeAndAfter { assert(rows(0).getString(5).equals("I am a clob!")) } + test("H2 time types") { val rows = sql("SELECT * FROM timetypes").collect() - assert(rows(0).getAs[java.sql.Timestamp](0).getHours == 12) - assert(rows(0).getAs[java.sql.Timestamp](0).getMinutes == 34) - assert(rows(0).getAs[java.sql.Timestamp](0).getSeconds == 56) - assert(rows(0).getAs[java.sql.Date](1).getYear == 96) - assert(rows(0).getAs[java.sql.Date](1).getMonth == 0) - assert(rows(0).getAs[java.sql.Date](1).getDate == 1) - assert(rows(0).getAs[java.sql.Timestamp](2).getYear == 102) - assert(rows(0).getAs[java.sql.Timestamp](2).getMonth == 1) - assert(rows(0).getAs[java.sql.Timestamp](2).getDate == 20) - assert(rows(0).getAs[java.sql.Timestamp](2).getHours == 11) - assert(rows(0).getAs[java.sql.Timestamp](2).getMinutes == 22) - assert(rows(0).getAs[java.sql.Timestamp](2).getSeconds == 33) + val cal = new GregorianCalendar(java.util.Locale.ROOT) + cal.setTime(rows(0).getAs[java.sql.Timestamp](0)) + assert(cal.get(Calendar.HOUR_OF_DAY) == 12) + assert(cal.get(Calendar.MINUTE) == 34) + assert(cal.get(Calendar.SECOND) == 56) + cal.setTime(rows(0).getAs[java.sql.Timestamp](1)) + assert(cal.get(Calendar.YEAR) == 1996) + assert(cal.get(Calendar.MONTH) == 0) + assert(cal.get(Calendar.DAY_OF_MONTH) == 1) + cal.setTime(rows(0).getAs[java.sql.Timestamp](2)) + assert(cal.get(Calendar.YEAR) == 2002) + assert(cal.get(Calendar.MONTH) == 1) + assert(cal.get(Calendar.DAY_OF_MONTH) == 20) + assert(cal.get(Calendar.HOUR) == 11) + assert(cal.get(Calendar.MINUTE) == 22) + assert(cal.get(Calendar.SECOND) == 33) assert(rows(0).getAs[java.sql.Timestamp](2).getNanos == 543543543) } From c1b6fa9838f9d26d60fab3b05a96649882e3dd5b Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 18 Feb 2015 01:00:54 -0800 Subject: [PATCH 240/357] [SPARK-5878] fix DataFrame.repartition() in Python Also add tests for distinct() Author: Davies Liu Closes #4667 from davies/repartition and squashes the following commits: 79059fd [Davies Liu] add test cb4915e [Davies Liu] fix repartition --- python/pyspark/sql/dataframe.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 388033d385e13..52bd75bf8a369 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -434,12 +434,18 @@ def unpersist(self, blocking=True): def repartition(self, numPartitions): """ Return a new :class:`DataFrame` that has exactly `numPartitions` partitions. 
+ + >>> df.repartition(10).rdd.getNumPartitions() + 10 """ - return DataFrame(self._jdf.repartition(numPartitions, None), self.sql_ctx) + return DataFrame(self._jdf.repartition(numPartitions), self.sql_ctx) def distinct(self): """ Return a new :class:`DataFrame` containing the distinct rows in this DataFrame. + + >>> df.distinct().count() + 2L """ return DataFrame(self._jdf.distinct(), self.sql_ctx) From e79a7a626d9ac2e2474b9d5008c6b5d07df5c6f1 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Wed, 18 Feb 2015 10:13:28 +0000 Subject: [PATCH 241/357] SPARK-4610 addendum: [Minor] [MLlib] Minor doc fix in GBT classification example numClassesForClassification has been renamed to numClasses. Author: MechCoder Closes #4672 from MechCoder/minor-doc and squashes the following commits: d2ddb7f [MechCoder] Minor doc fix in GBT classification example --- docs/mllib-ensembles.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md index 23ede04b62d5b..fb90b7039971c 100644 --- a/docs/mllib-ensembles.md +++ b/docs/mllib-ensembles.md @@ -458,7 +458,7 @@ val (trainingData, testData) = (splits(0), splits(1)) // The defaultParams for Classification use LogLoss by default. val boostingStrategy = BoostingStrategy.defaultParams("Classification") boostingStrategy.numIterations = 3 // Note: Use more iterations in practice. -boostingStrategy.treeStrategy.numClassesForClassification = 2 +boostingStrategy.treeStrategy.numClasses = 2 boostingStrategy.treeStrategy.maxDepth = 5 // Empty categoricalFeaturesInfo indicates all features are continuous. boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() From 82197ed3bd4b8c29b0c4b183994753f0e02b6903 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Wed, 18 Feb 2015 12:20:11 +0000 Subject: [PATCH 242/357] [SPARK-4949]shutdownCallback in SparkDeploySchedulerBackend should be enclosed by synchronized block. A variable `shutdownCallback` in SparkDeploySchedulerBackend can be accessed from multiple threads so it should be enclosed by synchronized block. 
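As the squashed commits below indicate, the final shape of the fix makes `shutdownCallback` a `volatile` reference and replaces the hand-rolled lock/wait/notify registration logic with a `java.util.concurrent.Semaphore`. A minimal sketch of that barrier pattern, with illustrative names rather than the actual class:

    import java.util.concurrent.Semaphore

    // One permit is released once registration completes; acquire() blocks until then.
    class RegistrationBarrier {
      private val barrier = new Semaphore(0)

      // Called by the thread that must wait for registration to finish.
      def waitForRegistration(): Unit = barrier.acquire()

      // Called from another thread once registration has happened.
      def notifyRegistered(): Unit = barrier.release()
    }
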
Author: Kousuke Saruta Closes #3781 from sarutak/SPARK-4949 and squashes the following commits: c146c93 [Kousuke Saruta] Removed "setShutdownCallback" method c7265dc [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into SPARK-4949 42ca528 [Kousuke Saruta] Changed the declaration of the variable "shutdownCallback" as a volatile reference instead of AtomicReference 552df7c [Kousuke Saruta] Changed the declaration of the variable "shutdownCallback" as a volatile reference instead of AtomicReference f556819 [Kousuke Saruta] Merge branch 'master' of git://git.apache.org/spark into SPARK-4949 1b60fd1 [Kousuke Saruta] Improved the locking logics 5942765 [Kousuke Saruta] Enclosed shutdownCallback in SparkDeploySchedulerBackend by synchronized block --- .../cluster/SparkDeploySchedulerBackend.scala | 35 +++++++++---------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index 40fc6b59cdf7b..a0aa555f6244f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -17,6 +17,8 @@ package org.apache.spark.scheduler.cluster +import java.util.concurrent.Semaphore + import org.apache.spark.{Logging, SparkConf, SparkContext, SparkEnv} import org.apache.spark.deploy.{ApplicationDescription, Command} import org.apache.spark.deploy.client.{AppClient, AppClientListener} @@ -31,16 +33,16 @@ private[spark] class SparkDeploySchedulerBackend( with AppClientListener with Logging { - var client: AppClient = null - var stopping = false - var shutdownCallback : (SparkDeploySchedulerBackend) => Unit = _ - @volatile var appId: String = _ + private var client: AppClient = null + private var stopping = false + + @volatile var shutdownCallback: SparkDeploySchedulerBackend => Unit = _ + @volatile private var appId: String = _ - val registrationLock = new Object() - var registrationDone = false + private val registrationBarrier = new Semaphore(0) - val maxCores = conf.getOption("spark.cores.max").map(_.toInt) - val totalExpectedCores = maxCores.getOrElse(0) + private val maxCores = conf.getOption("spark.cores.max").map(_.toInt) + private val totalExpectedCores = maxCores.getOrElse(0) override def start() { super.start() @@ -95,8 +97,10 @@ private[spark] class SparkDeploySchedulerBackend( stopping = true super.stop() client.stop() - if (shutdownCallback != null) { - shutdownCallback(this) + + val callback = shutdownCallback + if (callback != null) { + callback(this) } } @@ -149,18 +153,11 @@ private[spark] class SparkDeploySchedulerBackend( } private def waitForRegistration() = { - registrationLock.synchronized { - while (!registrationDone) { - registrationLock.wait() - } - } + registrationBarrier.acquire() } private def notifyContext() = { - registrationLock.synchronized { - registrationDone = true - registrationLock.notifyAll() - } + registrationBarrier.release() } } From 5aecdcf1f23a826f6236096001de1dd811dbc443 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 18 Feb 2015 14:41:44 +0000 Subject: [PATCH 243/357] SPARK-5669 [BUILD] [HOTFIX] Spark assembly includes incompatibly licensed libgfortran, libgcc code via JBLAS Correct exclusion path for JBLAS native libs. 
(More explanation coming soon on the mailing list re: 1.3.0 RC1) Author: Sean Owen Closes #4673 from srowen/SPARK-5669.2 and squashes the following commits: e29693c [Sean Owen] Correct exclusion path for JBLAS native libs --- assembly/pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index fbb6e94839d42..3d1ed0dd8a7bd 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -119,9 +119,9 @@ org.jblas:jblas - lib/Linux/i386/** - lib/Mac OS X/** - lib/Windows/** + lib/static/Linux/i386/** + lib/static/Mac OS X/** + lib/static/Windows/** From 85e9d091d5d785d412e91038c2490131e64f5634 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 18 Feb 2015 10:09:56 -0800 Subject: [PATCH 244/357] [SPARK-5519][MLLIB] add user guide with example code for fp-growth The API is still not very Java-friendly because `Array[Item]` in `freqItemsets` is recognized as `Object` in Java. We might want to define a case class to wrap the return pair to make it Java friendly. Author: Xiangrui Meng Closes #4661 from mengxr/SPARK-5519 and squashes the following commits: 58ccc25 [Xiangrui Meng] add user guide with example code for fp-growth --- docs/mllib-frequent-pattern-mining.md | 100 ++++++++++++++++++ docs/mllib-guide.md | 2 + .../examples/mllib/JavaFPGrowthExample.java | 63 +++++++++++ .../examples/mllib/FPGrowthExample.scala | 51 +++++++++ 4 files changed, 216 insertions(+) create mode 100644 docs/mllib-frequent-pattern-mining.md create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java create mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md new file mode 100644 index 0000000000000..0ff9738768aca --- /dev/null +++ b/docs/mllib-frequent-pattern-mining.md @@ -0,0 +1,100 @@ +--- +layout: global +title: Frequent Pattern Mining - MLlib +displayTitle: MLlib - Frequent Pattern Mining +--- + +Mining frequent items, itemsets, subsequences, or other substructures is usually among the +first steps to analyze a large-scale dataset, which has been an active research topic in +data mining for years. +We refer users to Wikipedia's [association rule learning](http://en.wikipedia.org/wiki/Association_rule_learning) +for more information. +MLlib provides a parallel implementation of FP-growth, +a popular algorithm to mining frequent itemsets. + +## FP-growth + +The FP-growth algorithm is described in the paper +[Han et al., Mining frequent patterns without candidate generation](http://dx.doi.org/10.1145/335191.335372), +where "FP" stands for frequent pattern. +Given a dataset of transactions, the first step of FP-growth is to calculate item frequencies and identify frequent items. +Different from [Apriori-like](http://en.wikipedia.org/wiki/Apriori_algorithm) algorithms designed for the same purpose, +the second step of FP-growth uses a suffix tree (FP-tree) structure to encode transactions without generating candidate sets +explicitly, which are usually expensive to generate. +After the second step, the frequent itemsets can be extracted from the FP-tree. +In MLlib, we implemented a parallel version of FP-growth called PFP, +as described in [Li et al., PFP: Parallel FP-growth for query recommendation](http://dx.doi.org/10.1145/1454008.1454027). +PFP distributes the work of growing FP-trees based on the suffices of transactions, +and hence more scalable than a single-machine implementation. 
+We refer users to the papers for more details.
+
+MLlib's FP-growth implementation takes the following (hyper-)parameters:
+
+* `minSupport`: the minimum support for an itemset to be identified as frequent.
+  For example, if an item appears in 3 out of 5 transactions, it has a support of 3/5 = 0.6.
+* `numPartitions`: the number of partitions used to distribute the work.
+
+**Examples**
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+[`FPGrowth`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowth) implements the
+FP-growth algorithm.
+It takes an `RDD` of transactions, where each transaction is an `Array` of items of a generic type.
+Calling `FPGrowth.run` with transactions returns an
+[`FPGrowthModel`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowthModel)
+that stores the frequent itemsets with their frequencies.
+
+{% highlight scala %}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.fpm.{FPGrowth, FPGrowthModel}
+
+val transactions: RDD[Array[String]] = ...
+
+val fpg = new FPGrowth()
+  .setMinSupport(0.2)
+  .setNumPartitions(10)
+val model = fpg.run(transactions)
+
+model.freqItemsets.collect().foreach { case (itemset, freq) =>
+  println(itemset.mkString("[", ",", "]") + ", " + freq)
+}
+{% endhighlight %}
+
+</div>
+<div data-lang="java" markdown="1">
+
+[`FPGrowth`](api/java/org/apache/spark/mllib/fpm/FPGrowth.html) implements the
+FP-growth algorithm.
+It takes a `JavaRDD` of transactions, where each transaction is an `Iterable` of items of a generic type.
+Calling `FPGrowth.run` with transactions returns an
+[`FPGrowthModel`](api/java/org/apache/spark/mllib/fpm/FPGrowthModel.html)
+that stores the frequent itemsets with their frequencies.
+
+{% highlight java %}
+import java.util.Arrays;
+import java.util.List;
+
+import scala.Tuple2;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.fpm.FPGrowth;
+import org.apache.spark.mllib.fpm.FPGrowthModel;
+
+JavaRDD<List<String>> transactions = ...
+
+FPGrowth fpg = new FPGrowth()
+  .setMinSupport(0.2)
+  .setNumPartitions(10);
+
+FPGrowthModel<String> model = fpg.run(transactions);
+
+for (Tuple2<Object, Long> s: model.javaFreqItemsets().collect()) {
+  System.out.println("(" + Arrays.toString((Object[]) s._1()) + "): " + s._2());
+}
+{% endhighlight %}
+
+</div>
+</div>
    diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index fbe809b3478e5..0ca51f92d7a61 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -34,6 +34,8 @@ filtering, dimensionality reduction, as well as underlying optimization primitiv * singular value decomposition (SVD) * principal component analysis (PCA) * [Feature extraction and transformation](mllib-feature-extraction.html) +* [Frequent pattern mining](mllib-frequent-pattern-mining.html) + * FP-growth * [Optimization (developer)](mllib-optimization.html) * stochastic gradient descent * limited-memory BFGS (L-BFGS) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java new file mode 100644 index 0000000000000..0db572d7607a9 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import java.util.ArrayList; +import java.util.Arrays; + +import scala.Tuple2; + +import com.google.common.collect.Lists; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.fpm.FPGrowth; +import org.apache.spark.mllib.fpm.FPGrowthModel; + +/** + * Java example for mining frequent itemsets using FP-growth. + */ +public class JavaFPGrowthExample { + + public static void main(String[] args) { + SparkConf sparkConf = new SparkConf().setAppName("JavaFPGrowthExample"); + JavaSparkContext sc = new JavaSparkContext(sparkConf); + + + // TODO: Read a user-specified input file. 
+ @SuppressWarnings("unchecked") + JavaRDD> transactions = sc.parallelize(Lists.newArrayList( + Lists.newArrayList("r z h k p".split(" ")), + Lists.newArrayList("z y x w v u t s".split(" ")), + Lists.newArrayList("s x o n r".split(" ")), + Lists.newArrayList("x z y m t s q e".split(" ")), + Lists.newArrayList("z".split(" ")), + Lists.newArrayList("x z y r q t p".split(" "))), 2); + + FPGrowth fpg = new FPGrowth() + .setMinSupport(0.3); + FPGrowthModel model = fpg.run(transactions); + + for (Tuple2 s: model.javaFreqItemsets().collect()) { + System.out.println(Arrays.toString((Object[]) s._1()) + ", " + s._2()); + } + + sc.stop(); + } +} diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala new file mode 100644 index 0000000000000..ae66107d7015b --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib + +import org.apache.spark.mllib.fpm.FPGrowth +import org.apache.spark.{SparkContext, SparkConf} + +/** + * Example for mining frequent itemsets using FP-growth. + */ +object FPGrowthExample { + + def main(args: Array[String]) { + val conf = new SparkConf().setAppName("FPGrowthExample") + val sc = new SparkContext(conf) + + // TODO: Read a user-specified input file. + val transactions = sc.parallelize(Seq( + "r z h k p", + "z y x w v u t s", + "s x o n r", + "x z y m t s q e", + "z", + "x z y r q t p").map(_.split(" ")), numSlices = 2) + + val fpg = new FPGrowth() + .setMinSupport(0.3) + val model = fpg.run(transactions) + + model.freqItemsets.collect().foreach { case (itemset, freq) => + println(itemset.mkString("[", ",", "]") + ", " + freq) + } + + sc.stop() + } +} From a8eb92dcb9ab1e6d8a34eed9a8fddeda645b5094 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Wed, 18 Feb 2015 10:11:08 -0800 Subject: [PATCH 245/357] [SPARK-5507] Added documentation for BlockMatrix Docs for BlockMatrix. mengxr Author: Burak Yavuz Closes #4664 from brkyvz/SPARK-5507PR and squashes the following commits: 4db30b0 [Burak Yavuz] [SPARK-5507] Added documentation for BlockMatrix --- docs/mllib-data-types.md | 75 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index 101dc2f8695f3..24d22b9bcdfa4 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -296,6 +296,81 @@ backed by an RDD of its entries. The underlying RDDs of a distributed matrix must be deterministic, because we cache the matrix size. In general the use of non-deterministic RDDs can lead to errors. 
+### BlockMatrix + +A `BlockMatrix` is a distributed matrix backed by an RDD of `MatrixBlock`s, where `MatrixBlock` is +a tuple of `((Int, Int), Matrix)`, where the `(Int, Int)` is the index of the block, and `Matrix` is +the sub-matrix at the given index with size `rowsPerBlock` x `colsPerBlock`. +`BlockMatrix` supports methods such as `.add` and `.multiply` with another `BlockMatrix`. +`BlockMatrix` also has a helper function `.validate` which can be used to debug whether the +`BlockMatrix` is set up properly. + +
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
    + +A [`BlockMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix) can be +most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` using `.toBlockMatrix()`. +`.toBlockMatrix()` will create blocks of size 1024 x 1024. Users may change the sizes of their blocks +by supplying the values through `.toBlockMatrix(rowsPerBlock, colsPerBlock)`. + +{% highlight scala %} +import org.apache.spark.mllib.linalg.SingularValueDecomposition +import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry} + +val entries: RDD[MatrixEntry] = ... // an RDD of (i, j, v) matrix entries +// Create a CoordinateMatrix from an RDD[MatrixEntry]. +val coordMat: CoordinateMatrix = new CoordinateMatrix(entries) +// Transform the CoordinateMatrix to a BlockMatrix +val matA: BlockMatrix = coordMat.toBlockMatrix().cache() + +// validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid. +// Nothing happens if it is valid. +matA.validate + +// Calculate A^T A. +val AtransposeA = matA.transpose.multiply(matA) + +// get SVD of 2 * A +val A2 = matA.add(matA) +val svd = A2.toIndexedRowMatrix().computeSVD(20, false, 1e-9) +{% endhighlight %} +
+</div>
+<div data-lang="java" markdown="1">
+
+A [`BlockMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix) can be
+most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` using `.toBlockMatrix()`.
+`.toBlockMatrix()` will create blocks of size 1024 x 1024. Users may change the sizes of their blocks
+by supplying the values through `.toBlockMatrix(rowsPerBlock, colsPerBlock)`.
+
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.SingularValueDecomposition;
+import org.apache.spark.mllib.linalg.distributed.BlockMatrix;
+import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
+import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;
+import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
+
+JavaRDD<MatrixEntry> entries = ... // a JavaRDD of (i, j, v) Matrix Entries
+// Create a CoordinateMatrix from a JavaRDD<MatrixEntry>.
+CoordinateMatrix coordMat = new CoordinateMatrix(entries.rdd());
+// Transform the CoordinateMatrix to a BlockMatrix
+BlockMatrix matA = coordMat.toBlockMatrix().cache();
+
+// validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid.
+// Nothing happens if it is valid.
+matA.validate();
+
+// Calculate A^T A.
+BlockMatrix AtransposeA = matA.transpose().multiply(matA);
+
+// get SVD of 2 * A
+BlockMatrix A2 = matA.add(matA);
+SingularValueDecomposition<IndexedRowMatrix, Matrix> svd =
+  A2.toIndexedRowMatrix().computeSVD(20, false, 1e-9);
+{% endhighlight %}
+
+</div>
+</div>
    + ### RowMatrix A `RowMatrix` is a row-oriented distributed matrix without meaningful row indices, backed by an RDD From f0e3b71077a6c28aba29a7a75e901a9e0911b9f0 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 18 Feb 2015 14:02:32 -0800 Subject: [PATCH 246/357] [SPARK-5840][SQL] HiveContext cannot be serialized due to tuple extraction Also added test cases for checking the serializability of HiveContext and SQLContext. Author: Reynold Xin Closes #4628 from rxin/SPARK-5840 and squashes the following commits: ecb3bcd [Reynold Xin] test cases and reviews. 55eb822 [Reynold Xin] [SPARK-5840][SQL] HiveContext cannot be serialized due to tuple extraction. --- .../apache/spark/sql/SerializationSuite.scala | 32 +++++++++++++++++ .../apache/spark/sql/hive/HiveContext.scala | 35 ++++++++++--------- .../spark/sql/hive/SerializationSuite.scala | 33 +++++++++++++++++ 3 files changed, 84 insertions(+), 16 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala new file mode 100644 index 0000000000000..6f6d3c9c243d4 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/SerializationSuite.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.scalatest.FunSuite + +import org.apache.spark.SparkConf +import org.apache.spark.serializer.JavaSerializer +import org.apache.spark.sql.test.TestSQLContext + +class SerializationSuite extends FunSuite { + + test("[SPARK-5235] SQLContext should be serializable") { + val sqlContext = new SQLContext(TestSQLContext.sparkContext) + new JavaSerializer(new SparkConf()).newInstance().serialize(sqlContext) + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index d3365b1e8f44c..2e205e67c0fdd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -222,22 +222,25 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { * SQLConf. Additionally, any properties set by set() or a SET command inside sql() will be * set in the SQLConf *as well as* in the HiveConf. */ - @transient protected[hive] lazy val (hiveconf, sessionState) = - Option(SessionState.get()) - .orElse { - val newState = new SessionState(new HiveConf(classOf[SessionState])) - // Only starts newly created `SessionState` instance. 
Any existing `SessionState` instance - // returned by `SessionState.get()` must be the most recently started one. - SessionState.start(newState) - Some(newState) - } - .map { state => - setConf(state.getConf.getAllProperties) - if (state.out == null) state.out = new PrintStream(outputBuffer, true, "UTF-8") - if (state.err == null) state.err = new PrintStream(outputBuffer, true, "UTF-8") - (state.getConf, state) - } - .get + @transient protected[hive] lazy val sessionState: SessionState = { + var state = SessionState.get() + if (state == null) { + state = new SessionState(new HiveConf(classOf[SessionState])) + SessionState.start(state) + } + if (state.out == null) { + state.out = new PrintStream(outputBuffer, true, "UTF-8") + } + if (state.err == null) { + state.err = new PrintStream(outputBuffer, true, "UTF-8") + } + state + } + + @transient protected[hive] lazy val hiveconf: HiveConf = { + setConf(sessionState.getConf.getAllProperties) + sessionState.getConf + } override def setConf(key: String, value: String): Unit = { super.setConf(key, value) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala new file mode 100644 index 0000000000000..d6ddd539d159d --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/SerializationSuite.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import org.scalatest.FunSuite + +import org.apache.spark.SparkConf +import org.apache.spark.serializer.JavaSerializer +import org.apache.spark.sql.hive.test.TestHive + +class SerializationSuite extends FunSuite { + + test("[SPARK-5840] HiveContext should be serializable") { + val hiveContext = new HiveContext(TestHive.sparkContext) + hiveContext.hiveconf + new JavaSerializer(new SparkConf()).newInstance().serialize(hiveContext) + } +} From aa8f10e82a743d59ce87348af19c0177eb618a66 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 18 Feb 2015 14:17:04 -0800 Subject: [PATCH 247/357] [SPARK-5722] [SQL] [PySpark] infer int as LongType The `int` is 64-bit on 64-bit machine (very common now), we should infer it as LongType for it in Spark SQL. Also, LongType in SQL will come back as `int`. 
Author: Davies Liu Closes #4666 from davies/long and squashes the following commits: 6bc6cc4 [Davies Liu] infer int as LongType --- python/pyspark/sql/dataframe.py | 14 +++++++----- python/pyspark/sql/tests.py | 22 ++++++++++++++++++- python/pyspark/sql/types.py | 8 +++---- .../org/apache/spark/sql/SQLContext.scala | 1 + .../spark/sql/execution/pythonUdfs.scala | 1 + 5 files changed, 35 insertions(+), 11 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 52bd75bf8a369..c68c97e9260e2 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -803,7 +803,7 @@ def mean(self, *cols): >>> df.groupBy().mean('age').collect() [Row(AVG(age#0)=3.5)] >>> df3.groupBy().mean('age', 'height').collect() - [Row(AVG(age#4)=3.5, AVG(height#5)=82.5)] + [Row(AVG(age#4L)=3.5, AVG(height#5L)=82.5)] """ @df_varargs_api @@ -814,7 +814,7 @@ def avg(self, *cols): >>> df.groupBy().avg('age').collect() [Row(AVG(age#0)=3.5)] >>> df3.groupBy().avg('age', 'height').collect() - [Row(AVG(age#4)=3.5, AVG(height#5)=82.5)] + [Row(AVG(age#4L)=3.5, AVG(height#5L)=82.5)] """ @df_varargs_api @@ -825,7 +825,7 @@ def max(self, *cols): >>> df.groupBy().max('age').collect() [Row(MAX(age#0)=5)] >>> df3.groupBy().max('age', 'height').collect() - [Row(MAX(age#4)=5, MAX(height#5)=85)] + [Row(MAX(age#4L)=5, MAX(height#5L)=85)] """ @df_varargs_api @@ -836,7 +836,7 @@ def min(self, *cols): >>> df.groupBy().min('age').collect() [Row(MIN(age#0)=2)] >>> df3.groupBy().min('age', 'height').collect() - [Row(MIN(age#4)=2, MIN(height#5)=80)] + [Row(MIN(age#4L)=2, MIN(height#5L)=80)] """ @df_varargs_api @@ -847,7 +847,7 @@ def sum(self, *cols): >>> df.groupBy().sum('age').collect() [Row(SUM(age#0)=7)] >>> df3.groupBy().sum('age', 'height').collect() - [Row(SUM(age#4)=7, SUM(height#5)=165)] + [Row(SUM(age#4L)=7, SUM(height#5L)=165)] """ @@ -1051,7 +1051,9 @@ def _test(): sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlCtx'] = SQLContext(sc) - globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() + globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\ + .toDF(StructType([StructField('age', IntegerType()), + StructField('name', StringType())])) globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80), Row(name='Bob', age=5, height=85)]).toDF() diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 52f7e65d9ca78..8e1bb36598727 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -38,7 +38,7 @@ from pyspark.sql import SQLContext, HiveContext, Column from pyspark.sql.types import IntegerType, Row, ArrayType, StructType, StructField, \ - UserDefinedType, DoubleType, LongType, StringType + UserDefinedType, DoubleType, LongType, StringType, _infer_type from pyspark.tests import ReusedPySparkTestCase @@ -324,6 +324,26 @@ def test_help_command(self): pydoc.render_doc(df.foo) pydoc.render_doc(df.take(1)) + def test_infer_long_type(self): + longrow = [Row(f1='a', f2=100000000000000)] + df = self.sc.parallelize(longrow).toDF() + self.assertEqual(df.schema.fields[1].dataType, LongType()) + + # this saving as Parquet caused issues as well. 
+ output_dir = os.path.join(self.tempdir.name, "infer_long_type") + df.saveAsParquetFile(output_dir) + df1 = self.sqlCtx.parquetFile(output_dir) + self.assertEquals('a', df1.first().f1) + self.assertEquals(100000000000000, df1.first().f2) + + self.assertEqual(_infer_type(1), LongType()) + self.assertEqual(_infer_type(2**10), LongType()) + self.assertEqual(_infer_type(2**20), LongType()) + self.assertEqual(_infer_type(2**31 - 1), LongType()) + self.assertEqual(_infer_type(2**31), LongType()) + self.assertEqual(_infer_type(2**61), LongType()) + self.assertEqual(_infer_type(2**71), LongType()) + class HiveContextSQLTests(ReusedPySparkTestCase): diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 40bd7e54a9d7b..9409c6f9f6556 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -583,7 +583,7 @@ def _parse_datatype_json_value(json_value): _type_mappings = { type(None): NullType, bool: BooleanType, - int: IntegerType, + int: LongType, long: LongType, float: DoubleType, str: StringType, @@ -933,11 +933,11 @@ def _infer_schema_type(obj, dataType): >>> schema = _parse_schema_abstract("a b c d") >>> row = (1, 1.0, "str", datetime.date(2014, 10, 10)) >>> _infer_schema_type(row, schema) - StructType...IntegerType...DoubleType...StringType...DateType... + StructType...LongType...DoubleType...StringType...DateType... >>> row = [[1], {"key": (1, 2.0)}] >>> schema = _parse_schema_abstract("a[] b{c d}") >>> _infer_schema_type(row, schema) - StructType...a,ArrayType...b,MapType(StringType,...c,IntegerType... + StructType...a,ArrayType...b,MapType(StringType,...c,LongType... """ if dataType is None: return _infer_type(obj) @@ -992,7 +992,7 @@ def _verify_type(obj, dataType): >>> _verify_type(None, StructType([])) >>> _verify_type("", StringType()) - >>> _verify_type(0, IntegerType()) + >>> _verify_type(0, LongType()) >>> _verify_type(range(3), ArrayType(ShortType())) >>> _verify_type(set(), ArrayType(StringType())) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index db32fa80dd3e7..a6cf3cd9ddd4f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -1130,6 +1130,7 @@ class SQLContext(@transient val sparkContext: SparkContext) def needsConversion(dataType: DataType): Boolean = dataType match { case ByteType => true case ShortType => true + case LongType => true case FloatType => true case DateType => true case TimestampType => true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala index 69de4d168a372..33632b8e82ff9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala @@ -186,6 +186,7 @@ object EvaluatePython { case (c: Int, ShortType) => c.toShort case (c: Long, ShortType) => c.toShort case (c: Long, IntegerType) => c.toInt + case (c: Int, LongType) => c.toLong case (c: Double, FloatType) => c.toFloat case (c, StringType) if !c.isInstanceOf[String] => c.toString From d12d2ad76ee673b819c92dd8093ba0a560847761 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 18 Feb 2015 16:29:32 -0800 Subject: [PATCH 248/357] [SPARK-5879][MLLIB] update PIC user guide and add a Java example Updated PIC user 
guide to reflect API changes and added a simple Java example. The API is still not very Java-friendly. I created SPARK-5990 for this issue. Author: Xiangrui Meng Closes #4680 from mengxr/SPARK-5897 and squashes the following commits: 847d216 [Xiangrui Meng] apache header 87719a2 [Xiangrui Meng] remove PIC image 2dd921f [Xiangrui Meng] update PIC user guide and add a Java example --- ...IClusteringFiveCirclesInputsAndOutputs.png | Bin 249245 -> 0 bytes docs/mllib-clustering.md | 95 +++++++++++++++--- .../JavaPowerIterationClusteringExample.java | 58 +++++++++++ .../clustering/PowerIterationClustering.scala | 9 ++ 4 files changed, 149 insertions(+), 13 deletions(-) delete mode 100644 docs/img/PIClusteringFiveCirclesInputsAndOutputs.png create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java diff --git a/docs/img/PIClusteringFiveCirclesInputsAndOutputs.png b/docs/img/PIClusteringFiveCirclesInputsAndOutputs.png deleted file mode 100644 index ed9adad11d03add0889a9a6e7ca06ee0e91850c0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 249245 zcmeFYbySq!)&PtM5(+3O4I&^Q-Hp;E4bqZB4TI7xq990jNO$+pp>zz*4BZSd^e}XM z_-etKb3rVtXJh@~CdPujc5#`5(m>G#iZqp5VQ zN(Zu1BGcH4*jZnro@(z}X#~_|z6d2TCaEgJjH)x&BYoh#keOXw%|Yr=*rNErx2=9N zO>lE4(R6>_ds7k#@c}$SbJa^wPl~idQ>S}H%J>zFdt#h7DOVB=T|tWE>4WHQ(#H?i z)+{pxISl4-pCwz%_{8x)D2_qxTuo zT~#^1r|Q$}v%tm0r7zKLk266Bfe%f6l`n?OAiv)ZsMxAi1CHUa!<()Axc5%Z=!?#?ft^();;fQ6n9lYc<_ zjE>Ltl7i=_avXiXGQM^-uky>tZ?`tD+l6=(`}MPbV_5mser3Z~rGisAT88m#629@< z)eW4dA65?2y_9?n!HJ+geWplgnF672hmd|_ zeC5NnfPQq?z=b8CT>ggalhns|nqOa#DzAl3mH;HT#5_(feKv! z|6}4WMjfC7`<=i$Vs^97m@=QS8ahmD$s-iv>#>)4pT&J?r~4S-@sdMfaR8HX^wz4Y z*kd-(!P7oO>;-|?H>1ypHNOS|nui$=Dn-`P8(>aUTD}c>=C8QFK7aFsVK!{*gY?uz zWhmO6Byi~kjv7wdSNX4$7%VIJ--F7GgshX_{_5+~>>%}RT}jE7y2v^l{YI z*~}X1F&e>QNx_o4Ytzfqi1Q0cF8~Y4_w@rxV&?*~K>-=3ET^FFBZDl~kA8FVB$Bgy z5DhaE5fy2CKzG%~CP52&etaPCyWtw5>w=3`@yTZ0x29?cP3V#s?XnppPy=)FE<;=P zz%qQnsxj>}6NvUoihBQv_y>%k&-_%eyiQmWbZB3{yn2ciBPEiA4iLv8mYSOAc+Eu{ z7l2!dUitv>%eoS87v1~Un^BC2fQeE_a0kO+tVl;faJ>!Z%4+?F+A z$WVWhmpxlZZx<3FD?TR0N1^(bIY+D(n_ZSHYjBjJ7Jng7AiH3UZ3|C3v`q?&>vNI$ zlOc?nUa_hVuki;vs=1#DV6pWUSBW^#LeZB{{3@X+Gun?Y-t{Ioh#O@*^WYio`Ee-h z!PrK#^i6OTaAfZxxESfrIEwuUL-dOveN2?x^0WMB1K3kvslPIOb#{_RJzabjBJ*f0 zbnC&HwFq~jQ0#lfd4+j3G1=Xh5={6}3NOWDsQVL%<;|l*6|HhzbH#IGb2&5#wjb)p zJE*uZ1xNGtp&z4v;`bE1z-at&9VS+oon@Z2T1Hs#TO&EE+Q_O|>BoRhJWc$YP^RwC zRb|EObxpXI-K(oto3GATK-xMwsyeOO%!S7~n?=w<8uiw~*|N*%V%fU9M_SApjFVQA zX_H!$qLYGr6p$EFsliN=iG;BoTYwFLE$!r%9ll+(9aUYUU7cOHov|I~l+GmkCijY_p)m6)4Ff7fM#6o*tqYvK-rXWlH`@; z<>|$SN;#p~TDt^a5}g!mgT^HbmIMvFYe#HOw~oiKHS!U%nUs|@om80Ao}J@;OMPv% zN}Z9zjOE3=5{z}cc(VA{P8&aDcB!eaY51}UZ>(p6X~a55HJIHFVV!DWXzx9RU*}xE zHV3RlP)R>uYmm3wQkwYP7)>pOY{qV>)Y%UF;Y>D3Qk zi4ZS52VWw-59y^c(bhSq54CSqN#`pyZ&V-xR+uI~RTff-*wsWMt$^!x~6zpZ7sSySYdzEh`CL&>XU3_$A!ETOOrTUSMgU3;4KG37&Jo|i! 
[GIT binary patch data for the deleted docs/img/PIClusteringFiveCirclesInputsAndOutputs.png (249245 bytes -> 0 bytes) omitted]
z382bBNTI5{y0|I;u!Zt+`i&CZLBkdljvXNHfTw{IYh@`!4`Uh}f~6xuP`R3e=@q8; zlug^Ca5}GI&g8}=wj3#;w#Dvi;8cEivBA8?dt)72r-kyVonOFoP=_d}CiK~z{GMWE zkJ;a30I*nuT2N;N0_Xeep$KxSuPD4E5PKJhZx1JJqNmGJmQ2F_%JPc-SYiNUrk< zwSTN3sBIgEtQc*m&zEb|l73nV>hnG&bvRx>jLsxVu(Z6iMFB>#kLpSCt*FqcP`xeB zofr_!1Gn}qWzc?R3(?_vTx4_*=YSe(sa5d!W+Nl^fU6jX>w|+9RnL=J?3LjgwyZ*+ zU?%Ud#?w^G`(ffH7JCN)c{mHIk_0`)1NNAVNjSC^Y2FFQ`&&>tgwuI?pmyB!G4-S$?B(MN^N5Zei?TJ1cjY&ysVDNWCxJ<@y z*tt#gMbIg((6hJUp#Nq81TJ7|rWnNos<4Vz2ZK02afB(XH(5O#KQQ<9nSJn{RCF%c zi@Fw({4^Cah5npK7kwa^R+xNQ2ftiVssyCa(>Z(SjkiEg!_AF9Kx`W**y4~7^qHJN zJ?Q#VQU0PCJua@B>OV4}_K*}+DlQSSs87TVQVQ=H9_7j*6dgMxM485^0z#7EVI_H& z5@;H|`H}ffrm43!t7{_v90XLT^T(u{J;;`WQ`B;a;+<{{#MAMf6|?mS6uaf`c@>;_ zRBlj7?OYAp+On&I`>*TMzmuAdAmN?&7CQQI@5+6VtN8suA$;I^UCG(Txt=bhlCK zH|Qrgr|fP~vd^;*-_T<*%D+RMWJGeO#r7&kIy^bj_`8f&PcKwvqZi*(*;6yD^0veE z*Td%0@bGBwuj+I9#dpjK1Fi{F*D`V%EP;wSj#X4fX7G2q1Zj|s0_hEmI04G9LKcpZ zf|?_&Q*!22aOd$2kEa7vJKQI;0^=I1+JW^(VebSXSLw+tZU7=to~5@2QFNnd!d5?t zVChLo)HjfnaiO@_SlTBj9RHeV^4=QfhCI5lCbtJmqQ2GJNCs}-l_QRmhHiyb@K5)8 z^}OOZPelv4qzqX6`Iv92wI-66{URpR0nj&#Bc40ymppWDru)!<=9+0!zxCn~)u5;$ zA<~jd?QGiks;c(%OrBR}Hv_7q;;U>(lwSeo0dTzAA)E0F30iR*D1PWZbFIO&qdR0w z_>=StZ%x}(EyqS3h`9{K8ZO5S%IY;B5@ta~@QLQ?E1;<5+*LxSgN-d(KeS$arM`FJ zP%LRopMOInz5i+2Rh&;=8QP)wUri6|XOqmo21oQC|CRLef(F4!@+3%Enn?)(pRvSg z96Wbw(9G56&d;Q_kJ4Mve#PR`X5p0EhG6Hml73K-lD7Y{S*oa;8Q+NgVYsrm111UP z^5{V$D!;6A;R#3zgQ5ILK7alu>Ei^-3o0om}lZc3@!RG!w z{1yLfUjDZu0yrNH;g9tu*-L_&pCQw`dYTy&(&th-8*_$Ix5Q~K)x~!97~Il8yL^+a zfPVESu1~EFC;|b~9;nVk=ueH2QRqFH>v_{`)2$1|K(#Lk5WGE9XEwyUa4setZy!R2 zUyRcHvH&*n0nPvVh+5CuzQ&LD%mG>{I1QN zEmg64&;c670X&PqM;eJvYxnA2O#4oP0W><{M1` zap0R1&JSb>f3W3D%-r!mA|TsbD&i|kI7 zD!)Umc0VX@oAo}Jr=`mt5ypko25I8xc&*{ZXP!%J2kQ97LWSQZhaZlcs z+;VeU^{zQHyK>NGN(Ub*!G_2u7hWwsa@(a4H=^!*1E3WY059$=s;<#ldNfpVsvg`8 z*LumFEyfc{S|Dgw!iyKk-Yl;PsiC%+6N(qu; zm~a@Z1&Q^9*FTE|A1n3+rD)-x!&OL5iIdOk24-V7xj6_Tk{1%yHEEjcUV|q25Y($9 zDBp&0696w2JuRTCZ-~Mtc7&I&1iz(?I`z3rf^Uo#H8nD-YDqnL~G?P^rMj(F(_!A3KRc7s+z|YNv+!gh}<7XT_3J!m;Ks z2^k8IT;eO)|5ZZaGS}KvZ={QI%y#K-qErJeuJfr_F1HViG-etjlvbP%`)rb9avKXy z)Sa(yi2bkEvHVMUtZuuQ{xG`U2{LyJBUwNEuZOfRA@#E;&0C3GK#YV%e}fr_j|=Et-+VnSumA6Ji%(C@0EBzkoE{u*7!|BIcBg$ zUCNJtHYg-t^QNLhts1_T*h}c>#JhjGe5S7?%aD7S_{hg=)hDt5|xIbV*0)Y8@}P>C`{`Di}UQL z{#ni9l0j3LX|d*3)8k;G0`JfD!@o17z^?*^@~-(KUf)o6=lRN~Y8O-_8}b9vpuTY> zt-0dr4+3>|l%`#lB;JJ6{$LO zR<%;cfaE>IRjUd+#To|>5N)^$A%^O1Fb7bMU7Q33$kTp_Y2cN&RZJx5A_#hXJbqQ| zb3t>Pj9{V0|G%nx6~Xh*lHTT2zEbWAv~8$?I8|DSgU;+7`8j1$raK~nBS+_h@xmVo zTm8SXZ~xb-)?;U83(m#czoR_#@!?OO%n2SFQ=Xlk^SuqY?pn(KHZ%nbR!08aa&cB# z)}Rqt>;s6YTXBNMdR+uF)6o{v&bZX8`$61Ml#(9bKG$A?db-P>V?$A3?qU3G>4q(g zaTB*qOSMJksLK5RK%K5@k!^}PvQC-U&$}4=&8Rq1!!&e;dp8L0JLO-W)P#I!)7|?>D4~KM zmyAH+kDM5!XBeudoumX9-2wyaqKG%9X%b0=L9d_V*D{I+3Jjt;zCg&_b!KSc$!ELQ zH77aqBp`_6ms-a8*Mp|;e4W1h{Eo`=E-%>j#AeyR56fi6D#^G0#k`e&D(@6x6o0=H z@}v9GDwa9{2&N3Q5FBy3?tIT~_~qhA`X9j%qU|YI@-PSsE*LtA;a?+era?UnNa%@r zg?}yTk9700U|8OV;d^Cayg4++t6<#?$tJJwL{P_*wJ5jk?oP4IgFEVSTl}g|RooyQ zSyi$G@LKi!cxScecHj)xU@qb#m`s>c!$#Y%x%>s(Q*=!cA5>|aZCj=J0Byc}k{>=~ z7#yoi-IvUC>I_1PE1*nFb`>#UDCBDEnVf{9hsd`dSVqCyWhe}D-<77H;!Zk?F}9j0 zo_saj0KiDju6UJ%wOA!7_^6gej>uHKux|||WxeFL*ZSu2`O&<@otQe$BMVL4*~(8& zrYFkzj!mV7_O+`4;cFI6+AOE}sc$6$HBTQfZa&^WHzE?AQGyv2J<@P!U8XJaE}V9i z{{r{6#B|gNHWG-bqns4bPIGF~{JJMQ`u_e8I{3R9GVg-bB8v{Y@zaIPc}bA{p2ds1 z%nNVD!l*Pg>)JPSdd2sdA(6sIK?z_#TCd^av@V6#Vsd+&@reoKSEAUi?mINZ=AtWuVGtjH26?4x4FXL2|)RL8efz>!6 z^9f*?SH7$nwC`{X5o8zr+8|?|b&UN5NF{1nwsJSglmR6M^a z_a)$|8Z@1{rA#{8lKz3uEYFLCZ5yS|yR}f#(mk<)dlFO9GbB@;rI240{uXv72 zgQl((kfUZ@CwtRt@TjYk);n6I&N_3`{{5njk37O1(d?Sl2py=sR0tZs`&BJh2z#7E 
z50{R6+fohU>zZ4uemAW`dj1^$b5Xyh+sM!qe!)~pV2s&{?EDXP#n`aJKaMC_4^ z#!&fz;9||shE4T!bF~u&_Qz9ly@%{sCx2$$3hF2VY=6QRn7pCo3LJJ(qN@O zbN(hFeEp9m?YrSUW0{o1_s-Z=Z971f2KrjPPy+l%ZYo4S^xMQw;n=_0jtygmdDnH= z7#Wcr+fQEH0>N^}-@3PB1AkG$?UUM-0`-5QfUuwcASo3r@`3PTlZspigYps(orU6v zWJg+cBW)mbvb3rU>yH*6XXA&!#+G}%PX>kM?Km=FR%-U<(d|#<-D9%yye*cJZb6We zG$W3P++1>3>kPR?rID697#66HH*}? zOu|v@Ayt!7g|Do$k_xOgAuQL6NZ@=LWRpL8YI(aFs`>|(Gv6=ZYeY*!EqLY=1n>J< zZ?h34W67*_Aqvhov{})mx~YU9?IY0S#+B|yDzIU3sz$4BxCv?YzebNP{M_w6ivyvo z=oPU_dyOz~UYg%-8n!u8$~o8cf{A*v7tE+IZKD*AM$5J@tYQls6kMp*&9?J=4iQF@ zl}i+m&*Xf-YjRn|G+Cwc9HbZ0#*{81m8>$id}%LsM9f_56n{{!bafghjf!U(X1Ob3 z@$MBhb}1~z3CI+VSQHjbDwqROv-L1ShYA>dFdp)g&3IDvrdaE5HC^rCY3Ex5fyUtK zM(=uATN?X1egJ837BDh+R0N{dMufV7;FvL`^d~_}7SfNW8)YUNe06HNES3tb_{|CvS#-z8xv`6vRzQ|D~XyP-7wlmoLs_ zZ!NG-0Qjl`1DHX9XS^^YOqQp*4w?s-Suo^u?2<;&Z`nhw(O$kRbq7yDC{@LNQYsmB z7?|^AS*ysL0ea}wfG6^onD;bDNr}v-FM&<4ff}@ROkLkEj8i?q`T-F}M{>c&V7fCU zJ5FAi%5dyQ0y9}WuOv+XMn+nS>ig(jb;B`JZhW0YQd#lk&+IatH+!FeOo|-wfX-ih z2e<~4Xa-&S;al_P*;|2+!t%UR2|)!2${)ER*0Jz4nxQ?Qzn`=qC#erIdW)~X57yCc zE|5=JETARE&&Z-_Xgku6^k=^_1rPQ5=&YxDVuFzC;cV4x9R?v4Via6JQiD z{7P2*ZmNJZ9Dvd*{)o?i_=6qhg4zY883yJH8;^jw8nfNQxUZK23VB!K^1NyjOFmS2 zA1k@Xn_-i@fTZc7-_l9dJ=es}nyup{uWVt|&Fb#rax4VbPx-F^(&vi+j2GvsaD`P? zDd33RefYx>V1lFatVh8S8ydA6`zyE~7r@hTgZB6BMgX33yJ6RYF3F-dr@juDD^*}# z{Ipa34r;Ehu4XDHeB{W>%D5_R6;v~jX^X9L!1@YUH);klxtK5fKKwe+09xpCXNYZ^Ks9(h*g&So>TBJp;2GGsImi=osq>)O_XPR*2i9z4(i@@+kX! zs|7Mzqi*MM$JQ>n+^F>*r-F<&5w+1mM^XER&`?B?t7Vtwqs}{8dp487mGtuVb~25u z?~s|pp>BDZk`c>1v`FRGrz@6Kw1sa^39apwEep<_D?KqO&qri0=2&_MNNOfg`ud9F zn!KL>EJ+UFv#O^l-WR4>ZdYqXEqeVF8-O&Hu_b%-eurnCNGCwsW|((gIVEr-!5tzo zEVKx5qqCkm|61EUZ}_l=O=vL5ki`$1w{?uBkUg{;3VSQ?089|weSObqY6Fj>=(An; z=j(xcYco2gF>Dw^z?RHA1!?g)>LBFe;-dLhj*o1&)TDBH#8UBVY1?pkD<_VRZkRVI zi~G#(-%JV5yMwQvgn=Xa4hyI%spK=u#JIS#iZCB`opcYwa$(m%yASsH9dW!>*?XM>tc30fjF^1r0(vzKs$1TC52wA_8Voq857k|k78!t62@d*JhC#ArfRd4 z^Znsr3*yVaqaW6qUigkw&Z{izPQPS(b2uKs`irlR|Z*dnHg(e77X^+F2RvMt{dJ= z17!3mA%*&6hUkP9&$-$t@>B!%dQhJIN^4q(EH4J#>J%;@5t~T~+p=~yKt#vD`1CS) zKf*!!=7q2NNfaRblP!t51!Pq@{58MI7N$fk$n;$B^k`T8uE^n}<$Ey9>F24&@Ha3e zfv`FtpXmvCN%s-jo1;bM-ctUaICem?oG&8?BX zek`BEEUiqFfs+|;fp)?JM5(yV}kpr8_HP7V90!by%F+Qc! z+8nZjHc^SKJ08#)A=a~*X&%NfQG)0ru*?&WbANyDHJ(F-;m|maa{)`7aTYG1vo$3MMn6hm2GXTRFp^!pvGWLV(8u%`b zL;^tD3<=G5qA2Cj)!5 zRR|@ZhFsjrDa^XgCFQq!hs$>SXN$@S_%tiof`=Pe8g;Nd#jv{G-^f_a0foN6>8Omc z(A&YFEiD9n2a@Etiv}XIlQ7c0CDxJLGeu&Z0(X=%CK!N*aQQxylZOq`9n8WXDiwMZx zDZ<(Bbdns}ss(_Eqt`g-l+~*W0^%%+3H`5ESEBeF-{@H1&u;b&K?UP$MtU<;xBAS> z80NRrZD}Z&&uC0Gc4MlCD%RTXbjjma2{n+fTT>~ILe}$$e`w@{MD43KViHa8Nl1{R zueDPq_m&n+eo`W-ht&7ra-PJC#O5RCwHQtbo)mz(hCX)#AxGREtm(XssJQYW&f16| zcXN{|D7c^P!@wym61O6*W8`2CCC~G|KGE8XaRkpi-l_go?<2|Zoo(y*tdu!C(K8%G z4L;NUNXu+5Z5fRZ4J&kv4*giD0`8CfF0qSe3JOgSE z>Hi9ZrJj`}@F(VUA^am5Zt4UyQpZ94Bb%mdn&HyJ9q}r z;YFZAr>pq3n+GY<)HkCpwLxz{hL!sB@-v{2`dA4o8Y)ET0_GLHabBr ztF3pLkQ7PlKU4Gb+C{Ju&H1KxJOEkeJ;QOZm6;|Q55cWnj6ahsrkYaZ#@;vzAub9J zX=gF{=>k?W(50q!bkMZ&FFBP$%4vJ|Y3Za*Q{zD`Ioe93Xbr#P;02C~F{|M)paQ`G z36J5ds=P*drwTbWjJ_t^#*jl}iGrNzeJtSjK>UXc?#3{wm+hHQON7?^$RhQ59z}m! 
zL7+&u@Wqj83jLO(8w#a#zE-7IVEe9&OS{O*ZA9>h8*q0ZU@ffThpa+o@{68#rslzL zMr6FfY7c}pOq-R~h(96jk3gDNkWr*hHPtVe4P2p0A}W6d1x(;Bg|jNq^zJ zw)9h3psJLb)e6fjk^MR|wn*AIGaQu{#TfBlfiv@~^-o=$v`Nr=e`;{AoS$4^6tCaY zi{tw?JUEW%aOC&;mbk28BK!wN56i&kwZ~V75f^ycOV%+1fI@R?-K5~}BcNw2$ULdo zYG<%a{bR*6|B-2Iu5(U&DYrVW1PJre3qmjl!o2uVUkLd#eAKAW3`upGrlJX%TIk66 z6Eh9mmr3y_5qi};R_l}5wh@LEP|$rU(e-358YLxVRkQ+9%TT2f4VKKk9%Ax*Q~(uH zeVK;_em@QEcS^CxpvXy9%{g4lsJYT4WyClt>&>Sr{S>ULY5DTlgXV!UcF{OohWPdV zzv}(B(3g*#L+9T$Q}e`vaEIaxnnloD-VeWg2mX8fjN^P_^_?m%Fbg=_pup(m8-ThH z8YU(b8j>?Wr|}24bxoPd8ggg6oH}t+--oeY@OYc0T)hzadk;x@XxIz~B&2|a!oCx* z3Swba0xP2x3ya?Wi)zDEksL5nC{5!nXhj^kVZv{!<@lG-Fr)*)w_ob_LYjZD)i%Lq zg>ZLzI-0_j+dd!paq_<%aZHH!#8Z7bZD>%KJR%gmj_K8~#fC$w$n5N6$PMD4UMoFc zU*CFQX)G8Hi7P$@@hnWpziJck>5Af_j5EQhj>{OzpM7f=4L(fNI%rh9Utn^a%4Yd)5zYI35#*qm}{mdV+w842A~ zYS`a*G9)Ev5BHq+CP=d4w?vGBG!_ZG z%2v#>nVJf6#<(`c1pfAq`4fdvQRzgf;$9j|?@Fyg`iYU&@3rFcF}| z5OX3;1P?u2R_2s!perx(EB&vnE>ecuy(e%a=6OMX6q>@10?y9$ZNqP;vUGE*ckACW z`0PU1J*{mmS<~peQW?87;1xLtWv6*xSKPQe!k^qy`J;y}!U}50p^i|sBcIoO5r2o0 ztHxQh>nIh!qXO~%?3!Ib?tVJu7ZEt?-c^bl`^N#(>!p1cz1reH{ctmX>?9v!bfBVY z$C`$R?Lu6(l}fTwv^$NBI7`BLtaHp$!K@4EO_#Xh0kAfL0S>ySe>|v2 zgy=tQBm>rxC~U*i!JoP%254A2uhNCQ}fIEi@-f@EyJsL(A7cQ^xNP)mk#DDzqHPYQ8WwZ4R0 z4Humil`-KfeTHYvU4&?lhzIGjlhvtPAOIegfXCD=Y%tSic=$+z?KKN^$piPAnW+$r z_THE{Y2NW%>j#ook)#t3qXino`3mNtjd{a|CK{3+pjq&lS*s#P6#j$lIDztnfwCr5 zto9EOuGBQ#AWX?+oGcOZ61>f$Cxc8Cz|w|67k#SXZYo<%`QrU~r^TRcYFHY0JqETt zZs1KVJS~k73mbc6d?Y^(&)J}j@NG*RJ>Hf-`)Fuv7Ie4o<|V|1#YT5$`28j2n&{PJ zNbaLSSjB}}P>2f6+FxOiJj{3DcH}MT^8EcbU&V+1%@LSmCa-)vdnGr`*e|8Gp^~G5I`Gq{O{n^G088e$p*7dAZv3`du=GZS5$u>@X zqMDo{^6a%-&VSpt5mogeLh)7pfY(4DOzMUcDOP~QU7UvJ!hu|!*b&h1;Ts8qVts1J z*`2-Ys?Hq?hLYI5(Qk%K8^>Bk&f}8jl+7(&8KvIp*!z6S<6QcUpVLkbr=8`V4fgv& zcC`8UZShVwwKa7${^g-;QXsuRu+c1pf@7#-pZW6>2+(~6tTHGUw!US<&0&hz{k?Jpp56T>fZ#3uhT)1*sAdGmm@&jq1}Vh~@XVocyfZ+<4H6ICm}9IH@ zO1M_*i%9HhE5;RLnjX1qyqpD>G}^J7t5=Ex=RDG&U0-c@vcj}oz(hWg@h6Tv1>8%H zp;`Rwm>PHfy=cy$9oeA=?v)_P=>`Cn@i3yo`_E1(ojaIrA$JFG>lK0be(8&u_FFfo zs>S}G4k7LSW2ggy{!f1z~=|;0IWHCz6 z)%@G$+qVG!z{ClYXxX13@9&)(jn6z_k+fE!Lw208cl;yNkKkrxYwr~ou+&YKP>-AB zNjuBQsb-q2_ZpEPOT9JE$Fz*2D5B5F@*I&dysUivgJ+ueWGU#~NwwvSQ!I3_!sFqC zLBl$-h}Xo#NwHFbRC>OA5Q1UK!i15D)MVn&Zo&8UE)%L*$G`P)ml<-p)#7YSaMf$k z7Hf9#Re5J2KP*CT&;qG*f)_?h)ZdPMmC#tZ6hP%!(h?XVUQ1&dIN@RNbDc?0J$69fM zngY_0P7b>S1Mw4wSuCYEwYod> zpmezYC3xBs5r7d%%vbeknETQnbW=KTt<7isYS~`rJ}D3IU(fo(iZ;+zQ)TFsGQ$j% zpTjkCJf*CyUfI)Jjpg7za{tTiBpx~6OcmdnwUUE6kAJLjZb>+3sU>thSqkhzPAeWV z(^(d$?{*yfY{w_^mJtcNF9-lseRMwiW$eFJtD$Mj5txn%$N z;~$MkGE|idXi?UwAiqSUFi-}NS~w-6dbYpX-i-A6GN-CcNHaZ3HVEedUBcZs4AN51(Jdfz#jn_rsI-CqswSWDQkY zBDHKcwP!=DW1tp>Pb}zq^uv|2(l6}RnzPy1Vy#ZV2lFLwLNoHj;r5%uSZC47Sb3n*lnb+aqx)8G95Il%p@h+#h&U zLlwX|*58{j&P!cMZ+~eL*S-uuB`G(F#w0C90|FrMa4hA<$m+$;7;Tycm7HSo)91;@x4>-hNaWX;It#+BJFcoP63I-T@#sah^757LWb%1cLIw za@vlJm_JaE)P7~?>Wj(Ue4+Y0fs_c!zkhi6Tax1g=yJVznm=ijP~^(zr)6Y-4;L!6 zIz1Xcp@3wWQj!9DoRO8z04!)kuY@C4Et{s4#6FWgd0Mf2zVPeQrLj(Lo197+vHwax z_M&~5W!90LI6pdyfzc5gv$ZC{-}w6Xi23Zm7!M|^O-j^GUBU(p(Z@4ROTkTgz(T9d zGr@io%K(+{X(>Cme;nbCz8lfex#wG4m_a(N(xzzO`T*-23{vGbS%P~8ZpyQB#vJ9k zJ3|9E=Jx}2Q(O#T=Y14=5-j320jg5P@LNVa&}DEGQ+LMHHv9z-EA?n@UAK!8Io9jG zk-hv~W!aa;wty)L2+@R_W1Xi-(;zepW|pM-?1Z!*Z_d_v6_12G*Hrmbqvcc}AtLM| z!~GK5cdHv6{U_JI!MJp!HqK?;0Er-us8jCr^~JK;+J=WJHZ$}6i`>uN-C%1Ch9XPM zDcb4O3Zp0(Ua5GM|Nd>IZZq9w(CJw}w(^`_!GX*KpY4iRx6kvi^9>)zI$41eCCA>V^j&=J!ALOuiMJ1+DChGa@W+cWXI@M&5X1f9n2r#>-BA7 z30jGwJH2=;1}g<2V&siW&X(M!i*4`L^{Jo{Yx#38x>GU;AlhV|HJlm=xDjDEZ7$B8 zpRgy!D@Dc;b^o4BYPS%JZ==3bz5t@~xnDexo$w9XeV#uO@!9G{3IPeyL%HIC_D^>X 
zej@5RT~ipvF+WoCV8Yc>?!gF-KI&;flMl{#0d-9%KG^ybcY(abo|G%sLq<(dY0HbeJC%&+W?uN?j8Pu$6u8*J)?*FsRHG{ zGpF4+r6YwF!VSv5B%iK68$Xo7Mt%`78Gx+OdKxq#Im_)rEWeI-Ix0vmX`Fg7^eX=zuM|CVh)^T9jMl(}}VxcbFMx&F# zYjXxTN($uhqciA$xI|!*fd+95(ANum#*{53gHusa=^y(lHNst#q=A>rZ?OcI6(imA z11xCLthXQm=ctP9z!Me5s;bK`V7}xN-c(F|{tcOPq%#7{@#@(sqo^c7*D6(SzUvo* zkCX27fhJv|#h7%A89&YSc$w#|Q)5rn?W<78XT4a1gSk>IfKnxe&dV}W#c5+a&^MG{KeaKdc-r*=8FN-I0y7iV z41Qy;+Vn4Pob3Q8Vq=bEEGo3w!fe}CFc5b1 zZ7mEoq$;!$aPH6E$wtu)mLqogmB>jbk zzcQ{qvkl&ed^l$7_-NY@0s>#yULWiboSb)%1=QEbp)vP}c(fb??T7e$A!C3XSG&tQ z26z?E0K_Vz)JW0WFeoCOZD*4%thc$KytRWVEE9iR&dv z|HNZcIZ=iK2I6w-QwXBU+mm!T2ufP_PoovSMx{j6(C7t&dF7-6j*Sc;6wA7!sbiDMH zXjoCY{c*0 z7AGkpmI^Z#NsSBU=q31Bzs}X%*tl@;svMf6S|HIcjcT+c>ye2yZ(FV1w7+?Rk;c3&l+5lq8E;5&Yw6>a-jl&<|b_1 z+;xcioEM@oN*M|C!os)LU;qW(YoOityYfqQ>*k?mj%hk8WY9-)Nw1VazFgBihmVX#6!?j+39wmbX z2!gKdSCONd?&$1v0K&IF%#Z*0C1ce_L-Z&a9u~&nWC$C5a}U7GEWO4|yyU+q%;IEC z|E)QgRhXrnUTjG6UXblh8sJFu1QrrBa1?s`(LeOw)>o_B=*<6on46Bl4k~`m^u*Wa zia*m7XJs}FI@R=sZdWv*^!=-bbF^xp$dWie#|;CZIe;`WU^_Rnx*ur(a?`aW^R=~g zAscM8n3^REzED8mKTo>3LPz!UH^RYdPZIK9SrLSn&zMwIN^8>t^R1F=%OzIeqSHQRlc(CAygKhcHirGuHgPAh=^ zA@|h-)oWlFzSHhB&$iThbC1sz0Gnsl+4YR?2^CAOR_8yw<4J}bZB{J((Z<}wo z>++-jE&isMP$WBo;oZX95{6J7|)v#*J@)OYwL>Q*W^{AjekFt zt(yMZF5gzq|6C5}?Q$Sh%)0-3IV8}$JKrh^@X*B0On#Bpti_bedV_2h-`~2|-^XG^ z&y@niDLRJsoeOB`+}*&%Bu9f( z)aYzYW(22tcynf=X=QUD*^OV@yQ?;o4)o=GfFvfl>6bEwG}8kWe%502G|Icfjr=VY zQnBHW|91h_Bh4k>)_oL&s@rNr_g~pIKUyJ1$LT78MHSo(-mKc7nAV0&FJd;JaJ);a z@%A7nR@#RpuC(nBwF`M|3LLI!0JWD4^?2%$bE_Lz6z$A^_*9kbCuXU~Ud+-R@Vz@b zN=dH&WW^YCv=W`8lp|d%MIV{NfmwyTAxGHyPOHT%)3Rw=4djbTtGMtEYh1SJ;%KBI3(-W|0OnL7Hasnzs{fqSn#JRslaTc_YGPiJ3(RgDvoH3r0{Nyynx5?NO->!KRiDFz2rDw4talPfNN(REa{EUX~lz z=fHrjfBflvE(>Dem3nGAKLdm+=w|)Y)j^&l8KJhQ&xD$sEK`rPxR8ou;E{K7Rn(`?kQjUly-*>;^E?%n4M z`UewhlgKD>;*Ebo9itD7C{;pmSOwpWqF#ODWKX48+FHQl!<+VhQT&P$od&1et2{;i zb)}Ski0v+-j*XCc4MvhXh@M;{W<62N9HQvDWw$MAE;%V|DQOKQ32bx9AI@I*0C&|< zG*`wuItbj=xc=qzFq=Ea@N8_AuSq59raU8Y1V{S_rvmiiP5fH>tN2YKKV>wV**z}Z zoNvwsN3ZB=`VCSghmH{|K5>$>(b_e-Uiefn3lq)ibztB)lQcUfqQx!@GI(|t1MMTA zRHB2nUexZOpAZwh{Svsd3|2T?`%3K4+O(+5sGlewh{O)rJvpg@Kn?qpY6;o^wsYEe zb2+1UkZSCYZIcrzX7UXr<@KWeszplbZ;TxtjYD@64rSr**AP(+CjU8P82xEw+db-3 zihks(u@9#wk-n#HB+>so%#9{yIBcdPt|7wdx`wcy{blkOoGxthH}gHp0V~d0eE2-bR$|f6V_h^%PZwc`6hXc_u?feD(c8L*r?rs=>kZs%p$X#zAb@XbTbFgW3VCvE!kTqosxAyAEYS+ z90f>LKY{urn8t?swdA% zqFmQ`;=d_J$FNPBP70qC-X>5(c&U325tDj*HI(|C;7(=^H^bT=izSZa2w?HP{OtD) zLbu4;p|X}`Bca;LERe-KyF}@q4;=G@VuKr+E@WUr^H~$OH)uc?Xn)?=Blma8yY!Rn zy0h!G>2oJH2Qd{m{X_B@LKQ#AH$Pyc;qgd-o@Jzn4pXG2MrmHq5i7S3|s-SiO~xaIc#2c6>|cvtuP0bHWj4ONm**K{w=kZjYBvrZ@ z5bgM4dh`U$Bs_8O zW=v{fpfevlZ^$cr-`0;;lDj{ngp}n5_{a13eiVoH2#$Z%bOIJ6i3(=7vdRj5>!wCO zQIP&A`IYy*Tu~8hrK22G~_>@TS`4s8;m=g<3j554BEljNoGnu5@y*oaG_Nu_$3eyhrl zlz&iuNsbijt@^%!y8(&)oZS(Z9IEq+oav{tWgN=4@6R6^)IPqyhoT2erOw2PEiN=% zTX@I+(Db5)N{3}Y^hllJS!#~;O;kW(qqds%Mtv~Z3DhEY?y+n6N#Uw)TtyhE(K zX1>-pyJ^z7GvT?7>r5B2^{GntrO*PyYyxF4X(&l^@bEa~2gdzs+{r{JIyvj7;+?O* zejfYj4!29(1b*u2P~J1TBqrvtNI}0Dd^N|=;^lpM+yEk=mrXc3-``K*P!D>5M{DpB z%P`Z4R*Os+lwMAey8cLxdMc-H?qU1Rh~PVu9+i4jkqAb!ko(F$GRurGVNu4tgUSfK z5Vn0Zx;o-eDT=~B=LvXzX3P=@>1v8jW^#}-51F=1lKJ?;kAsq8QvFl?{%op&wC%IL z#dj`4dlGp8#G_elCOZmdmwb0TW7YooSmzbtG7qv_asjBHV@*jGdS!4h+U5e7>;quz z67ckXwv?WcJ*DeHF~Ve;)w`1LRB?RaNxRYS<&M#ykrCBR(K$El?ML^eVzp)tJ4>)VPqX^O z+|JJb-srrr*BN@qoa*=BS9z|V9kdTEXWRZ4sqNqRENbrg%gylqoojLY&OmF~`WI?T z^s79041i-cHf>v;46rwlllr+Mb`OS0A4fo`tT-c^Td#i-wkV;@8`0;})O2<+sf3@% z$YalFad1>kJTKvXR8 zpi9K%W%{@qpVce$2pWkqjlc@6I} zAV0#TrQ)|W2iGLazg|0?pT{r++63V9-{hNH7UU@1<^~DxN+~&6Zf|gn@?FnUB0#vM 
z{WIYKSpBR$1vqI==kTfxyf5Sn$E}|b>ka4YsK1!vgynu0wWjo`_);LK^^{G1JR zr?kKI{wZgjV*Kue`F_6v&NdXBzYO9n5y{#!)1cfGN-2tGR4wkw8Rj1Fcz$3h!Y$h} z82~X(p3NYOV8Vx`ttw_QM?8IC!8DeBE7O~kaE&>m3B{~_G|g`Qy)JP=sD2vaf!gQl zo7CMlGdN%`x;=d$*+b%8e81Vbc^r$;L?p5$-P3g{@i$5{uR5Xir_-)$h}@1^s(3kZ zg}`LOOX8_U5__0KMv2iVrE-5&R`k%^&U8tT%DN}2{vpDU;@*q%A)fDI#hA)@tF%d! z_>Ktm!G+dn*KgXlvr5V*-J3>dxVkE;T_M7XijQH5nK0XhnSI40to`Mk{=-&fn$1)D zG?k7DhBhUQj2a{Knv0IkwI`7Rc|FAm>$zrqbvpp+*>1X6k(fl-L#=tLtyNJR2CwOc zhNUM^YbKOi&oQ-@23t{Oik(?Aw)pxdm$dug!l<#ORD$ZHNtog&NL-0X$u=WYixcqu zD-@RMP*G7$wYgXU#3;u0?K(V)sTK#YmX21>`1pHbKHGjEdY{Lq)KdJ?7@NQ!I0j6X zjMQb|qqR;oHYRhA%;A6gcl%i%shUFFYcu!xB*Okeg2<7x`r9Ap?-f_BeN{od5zn2s zYuM8?Z2E15x-yVIM+p(H+Q@at!x6%1+>Pu555kAFhJV}Bg-DFWC~aQaUQ$fE&kzTv zc``f#O5GRJEV$$UT%SVdNXF`CGwy`XushDIzq@of)9{@jd=Kxo4_0ZJSqu8~%k2}% zGi-MPIS&^V750zx+g%}fW?k&uEjh0Ny1FsR&Cb7Oa_)iLyUbtKH>Kz5NhlmS)0!MY zAC~f5CUsjKX}t*!B7uPOFz2NE1QPOSLVf1pY89werCXza_Vlm=v)=bjWky9=u&dcV zM9&Y`*e11UW-r%XMpnF+32Ao^^-w}UQSj{gnnl79j}#wvA~w_#2Rg!ZTXG?uTfG&x zwjHz7Z^6t=77{P4;1Cu>leT{YS~y8U8ym;B8gwGQbk(Hq19-K z`_bhfNc4rmH}k)m3n#Uq-fkji9w?WU!f@@K4+s2N&Zjv35%C6yqG06Jd44u|Z?z@c zRd!Xhj*KxSWS5IbHm1+3=qJnE|7HQCtF^fQ63=kYuxBlT!fwNa?nf(S!m?jC1U!EdVSLv*AyJ-} zxEILb+s*n~8ELs|C7NqhIbu|vU9w0h6WAfV#={tADKl|6M#}LPFv&XSU^}~*XGF9& zDq=`8pOv>DilaEu8heH>wZ5S{s4FXswk#I2-@u;82~Xv^4o9;MCk0&MbiEWa zJe-K+Y%Y;*N>uw<>i5&NuG?9c5Z6Rp1vj{O+V76-{6`~8oFXy%s2RnvO#iF?sHB++ zGdN#yf6lx4{gCG*l&Z8S=__msXz-_72d${!KADMGG#)zk{HJvDl$Rn3Bv5{u;E_MX z#COi_JS#*Yql+eWMlc`DG#`{I`^Fj!GbABNl*~~0(_}-eQK2RNkBJKCuPA^l%+O~* z8PVRgvWvo=7|m)iKTgL`<$dwu<*3)vg$UcfpRzKK&EfsIJsqsjG<#>G#(Vq~=xu~3 zFp01RYwzgTh*6+G@Y$%6`0YY%!6VW-xy#jSloc!Gb{AD$H&JP_C{j3>ok-eJ?pa4JH7VIFxsB7TZS0i8Z0N9?Hv89qQoezuED-;4?B z;MA+8YQ9k>98~o7{e~442z%}gIaPK^R%rj-8p`|ht@8-nL$iGKZx;T>`{63iScijo z2Myxq1MG{%e+I#^2~18IyFxLRRJJa2kiy#-`huP9b{z}4XXA(1MIaXIG7?dBMH|SfBLga*#w-?5)*%S~+dzb+2>i)xGi50SLcO#L z6OE=^D>_kIwvCP`Yvz)fX}R67o{>^hu}&}uTQHKL&4dvk5kWMnzY3oPQL)E1aOwzH zjn3;c%H^Mb!RB{xo)|Ld33s{jW4EXpi7()lMG;E#L&5*G**lL@K_<3rLR0#CgyxuF z3cTubS1Yk3Zx!crC;Xn8C#!*YPmF{Hk90MXwTW~%1`NHRJkqy#8?7qwZE=o9M>yi^ z@T9Dm_?in3&{!uOQzt=Nf&v;&$6R>Ww808stm7ojuRj5B84LGEb&Js#V)7%DnBY7u zJP!nT4j+Q;io5;-zlGjH^-1s6M5ms7BhPTGOV*|;gK7%}dY|a%yZ5W)lq@3Y)?ARm zmqRTa&dR%%AJJp$IssxH?@4r<2UlPr!Kf;a^WbYvAlyumDT&Z|e94!`z4~S3s-D#P z$9u_B6J*Gssg=;?W{*U?!fcNWQ?We}2m0*ZM4-tiPhl=DUHhfOcda<`rA(Eo-^ei; zNsuWJ7%M7xz5akR&6Uk5#*+{h7-q1TCK7g9yaIMf;Zqz^;ydV>*0{I2Edx@jE;nc@ zFDXoa27Z0wHppeUT5HN1jWk;t8(xK+K$-}nKpJfH%Ajy;8BVd@;I+8vXP}VeEYz5#->t;^QPwZtMu0ScQHsZFqL3-HHq2}t~_~aP%gktdEK3|P- zAy;}l$}f4r;?FSiPDT1-*@Xspe!qu0Nxx9 z;4!<4z|9t0&-S>F<{~5fU*5h%aoEIlhi>C|s~>b9A`*))eSyc_&tA0IlePZ54r^d) zR21lb+i(CVU3qpHl%Tr_Uy6+o#IBoL%+oJ;9%n4q**1nNypz5thjTQ`!}vt1u@p?> z8NO1I#SUsFU2MN`TT@y?f9@6gbA+w5AlL6p@{DoqP8tSW!;QC!NJ|!xol8xx1HzC9{8Pl%%Xgb&)7f-Z!M#ML8b#bumM#v14>%5WEb$K2Ufk1hY{`M!#tXN5g{9gGzSm&_i3ol1i5y#O+slv^;_|mev#+f z&%eH%c_`7&-p3VQm#_FhtUVk?`xaYmTYp5!az|{U8X6l1t(3R6V1txKPw|kt697es ze7?)!f}-AJbab?goE+Be6kv#|kiZV(e-*LU7DTY%5b5py>~Ur3L3j6UHcdfc-j&P*c?9-3{cpLvY0JSf++my@@_{`>g~eS#{MFAO%dzD z0h6F}8+@~B^w3stH&;@^--l}OIGt*ANEgO8t#fO_|J{&x`Ml^$?|lQm8;^)DI4A@Z zQDY5555_je+3w@I456646R8E#>p9~r09Uejxke?s%fZ|MbUA_*Wj$t;3A(D^29Ik0xsr=EdpYTbUHp6OnbqpllT zJkXG#g3U(7!iArFqhv1k;a~@o)~+AIppeeqL;xvHR43yE))AfHQb{Dozu}?5!N0&; zrFyu>MoMcbq$NFBRs*@o-tqI1#n{ZTULhTQ+z5mHgM!dk4<8v0EJem5A!DKhb0GPq z%|2-I-W_Yb{kvTeLu(?^oI70NqoM5W=f2-u`t=Z5iCe4V6a%CC_!zikpdkxm3-Ee@ z)JC2cdV0llnx{+qDP+3o#loFiKrsBc0iON*6842`nsz3ZB%1YxY#-YL+Q*Xq>0P@Q z1M;~6e;Qsm*H-1`Dl?qGO95P5EKxIQ6Dyd5UI;U%B|ERf+en(O?(1Y~@7_+zKz;9G 
zhv0a+52PK^r%sd3v3iQUFRpH{8>*mcQS9(2&}mUHGOUO`xa!+!fi7@DU)$6)IFdj= zP^f-1b-(#;24J^J5upwDq;gy2b1Kfu>ijLxW6hCc|NANcaItp=j9vi`&>047TN~(Ew~Z@(D+LwcvuE^eD;0TcMapTM zl~G`S<$BA^=6CoDN5pMTk&PKY{w22Jp$I7bkZsLa8m2(MowT6!;fC*pMBkIhQB3of zMg?MLGQcFBj0M?p(Aqey8TdR-P(TLnj}DfMq!Mt=EoNqNCjml7)a5y-mNm|N#SEJm zk+aWQQwd9Pn}(bTElinRA>Uy%1od$9a#AM`JoJFzUHmWJoTMhVDt(vJ7Whh9O2d6J z%R3L=yvJ(T1z!_j_&zF?kxG0nWzHa3_T7!)rqvfRIJUmj0WGC*5i z1BDjoM)6JY3NyXfDscp)6}++;tO_$h5HM6L@*=JxoH$9Bj~z4aCS)(i5_$l%kfmV! zM{6FxYy6H{)@MQ1Sl>rSF@Gr@Zy6f-V@-Cr4c(U zf8JUo2m-a!8BG{{KHwuu6ah389j95}*BJ33gYNHRz|(sS>-6qeLeBf2YYfU+CtC`9 zo)L=T&GljiauL`)%S5Uzy^0~IS>yoBL@EMLB2bYYvQps)H@>+6I#s?hn0ZXtbQ+Y~ zw$X%v3{LEN%3;Cdu!Hg)KFU*rgW}UlJca93yx-&j!X8%`d-N$^@QeQ5TF&cY3-`4PESeHc=HuFdR2O_wQZ-x zDQO}}kvG}huOWcMmCu2ta)1S+LS4xkEyt(y(-(}w-8IeKIdMCtay`qJytx!(T`T=;8Rl|UGO{`9p|7|5}oYtJgzBi|tS!KEm@TtQ9 zA_-)BIo-TtYhKd;clYbVC}%@=i!4r&5=)M3`Zvy^-yjVOKaK?R$1l|{Ti=ww?mArf z^Y-VOrqh+{l(QTnTa3ihWdKnPRMgaf&%TXjxyGmugNVoy16*|Esr^Y>dKUEXPEmML z=rVH3d4rSX6NdI~KGoROybvO;;KR zv~4+0UOgvaTG$r-yX|(9I6@UPeq$QiaNIQijQ<)N-PpW9mGk7fkL~?-o#|OoS?q$b zH0dHV@@tb_aX6RRnVB#bwFw{^Gw~6KMYYJAbYfy=1_nqS=%ksxqGCjsf%_2BV~%{+ zyNxElmZWE#(`8jdp+C_8*k`w9isYrn1zMqAFludAKL(l*e%wE*Cx$2LmN-b{tvaL5 zG1g?$>uGhuk0s2zbNRcaLC{456zthHfRuk3xd^Nm-utk6<=m;{%}3DNzYm+3)w)8NPu}U1jskt zB#C6k7`XopDN)8>M0j}I*6+m3T4aexLSk4x z79`?kuPWP}E; zuD)KXVMN&TCJbu=Q9J)dHH62(g=cmVk;n?Eo9ozw=6yv=+h|=uXr}Z ze7St_94*$%T(%N2sr$#=ze^^+=IS3YUH{fWQJw66g_BzIm)A6FsF|r&^6AbS zMDLD`#h5+<{4&Uh1Gl>5mSE2`w~>hnTsoe2+kfa7+1@E18pWTU1=4O?gvE(o*$I<4H%8KvGi)!izoeTK60c zjg4rY^8b|&E-n+$*(k(aO7q%&RW3T@)lN!eJ&%DE8N3s3h?|@U=&&NfLj80t8(|{~Pn9b3ue&en ziiU=c&JwxF842{+PcYkcR$0OvOLH1IW3HACzB|x(brhuJg{B~w79YX)Y{O}z+*Qqa z8TXw>+5CbdulL)5VfZnn;8cN4%m>R#<|bZ>H+{P6c3*NJoBr@*-+701ByX)>DOo+& znrL`*PNDN2qp)D5W`FxDlvl&1m5D0rDKzxL5!mLNtAg*#6a16c6R*X6TE4gEGr5>f z13PU_Hy*zi4e9GsLgd_>+B6Iomn!pPAv~so)wZXGz+g}#56}#Z2IT$WW@gr|%ocWd z8xD;Te(~X!det0lY2W*%#|v6~9zEjB`rGag+2_p6WtXA4n2sB z|MkjFFzqsK^fwoJ2_Ai-Rg)4Q5YfDvQ6amen=7rj?>i1x!4oE9K9{Gjl>6nA=R3Pu zy`f^(;kX|~Uyz8B0{0R?I!iTGGG(5Wvoja7prjMj)!LhkM)QxKLxih_3C#Ce@i~tN zQIp)8%dI5gY*(QlO`jVhk`Lb$W2`W(5G37t9P-K~$a5Th>{>2QW|lQiDQos}Fx!Gp z)CPGp)oQVDrmm$o!?5CDTNQ#DkiuosF(hH}6pnWJ2(=KKdO?oyYEj>;sjt&QHbZ zEDzMLP^X&Q({3ec&%c2~Vc4jZ+}O_#7nObz1PNN)S;3^b-W&}%9Dn=xwzkkkhi(g( zTy{UfI(t>1N;1+gaip(tboC@xif&w?afrsrmt7ru`{o@0aB<2{ApPpL~~k? z%WpwXvskOk_|fe*4qvO;(~`xFb@DA^qhz=2pQ}*aS|8=x2F8vNG5o{hEfm`HnLlT7 z*&YjSw1eZ&L$DupI0tVfib+3aD#b;plcovuVJ6UdN6{%hF2rZAicYiulj+mq+nbHtjWc$xBh@ucFcByI$EzSqH%D61ON$!ZfthFd*Zab|jO z+q`T=Vc_MBeowr(Qqbp*yfHN)-x)}=MM&?NG{2#z&%XG+&o>j9Mz|)8ip@xk5X_i? zi(8m2ppSm}F>}M2+8TmhCp#O4NgXuE%r{W+PiKZE!?;F$*)-d@T~1_LjfT*S)8Y!@ zfKR_@Bq!&Yo|BP?Vc4$3qOS{SsS!OJRuQsKgwAJoQM4sMI-4_EQAbDb zD#9iDxiu@PX{(cek8@py3Jyop^(JzorC93(xL41((-kREzKyK)jGptP%wXvg33sh+ z(Ee)#Y{p0VQZKkstKIuQc7!;%7n~QvJF7K$u8S?_#IpITTQ!Nl$1fd!nRLiv=APoL zh*#5lQ2up>)8f(t>Ipq~Ms4xg+0oaBzav%!zn>(2QFVbX3$b}_JD>l2AE+kPIcUS- zxQl#r@d_2}!63R-!1u{$H)XTMd8_#B+~Kl#A)e%EttCxcihZT$c1*2s0sAZY){rD| z&1MN{>IY_`n;x^R*Tk4}b+vBhCLtu|uN?#^c+42uH)cmeWJSbi!7OXjS7Cz1s)`bwVSFXuATAc2k`5%Yp%$VWEaEJG`Hg z=`h`(Fl9usS!Ehr{*NXmIZ50nG&UTr!U)U818K(fv5)NxN1_;3URZ;QG6mMi7K>(? 
zifRE!Je1TLm3RW zrT%ay68ms{h@#pE#H~RVOzFG*Gj&50e>K2j0H>zGU17t8Z7#)Xl#AIo7n?+i=}5&a z!&S=+%YZ~&L{X~g75rn#U$&xm_tG1B3~^sVz0TKz{vhf0(xlTz(@Dj;%TiR4=^?Kv+Cq9V!B_W-5&Clgu&D`0!O)Tv z^GA2)8i8W?YMR!qO~Cxwoji!O@k^fYeCo(?G4{G_nN746>`u|yW?oPX28 z1PKC?$Zb@t{0M&(9;*e}5R~3~SW%d?qM5vOA_mfP#_a}^hJKaiqW$i{ZHneg+&$9( z9#iHz|4b-CXKq#K&`+Yc&0eI?TEPw2^Bw{FkCb~GG!lQ^-hpm;?cOi4haTG?n~0{o zb=Bvy$GP?M{dDOf3$vmq2_x|$i(;D<=Pat_RSd!fFAiex;QJnO-;O|OvU>#|7M7qd zRkCp5l=d86B08RWpR7$OGUr(x^EGe%GpYr#el<$A-+a}=&p;qIOR`nf$P{JPD>nSo zs^Gl*h*ebIo0ci&%=rbEw6z7rBe)}6yb_&Sj|s}Z)P!uNdrT^@nK+Lu&3kQ0X;(bO z{7)J5lZ3|K=UMWW#s`q;AZO%KPm2j~1x=cdaAY9S-0qLV5nz6+T^>CndRCW`Wm%pZM( zC~kFHoa^dDD7e-=CwuzT;SQnP-y2_3pYKT&mUo;CqUvWfTPsN5U|?b4jdIzbejXhC zNp-(X{&7?fNnn2tm1JM5LHHzzprHh7%m_RL(oJ=AbjVC6x)glaPkRRk;iSvQN=w*`mPj5P3RcqjW;3+6piT#0r^t`S4nqzE%9xdb8$T=B{| zAw0MKNVbZK`r0NmZE!>xcZby3L6-3AP$rW45kecP)k$3WvZ0dUkTM&`4x$5HPo%%~2zF{)BEm`nkD< zvEm4(L{=ss#$3X!QsU+yK+9v9ZW2H23i{lC1>2;<=8SZu9$@(QThp7C@ z+Pkjph$7u&OUo?^Smd+?o(?9WhS~H*bCr>#aH<|-qKNA1YW7RLvo-aa(x3O2AE+MY zHABCxehiN-!E^N~kF`*Cebxvu(e#Ux2MBOTvFMc>MRt8wj7zVrA>ercGPt;g!(al(z z`vY(yL%KNp~Y6At4|o-Mlxy=Q-DV&R@ob2seB7?7hBgt&iCHv9ZhA z_umXVmd_acJEVm{kH2bOK+8)pD=)Od`KMsfk*&B61yK<7B{&;OKrzl>& zQ{v1dKi$uXF73St6o0De}vU*pQ_1Fyd_DjZG-A3Ltn~zRRd{nFEj$G{K5j`lYLj9nU?NrN|{hh+dL#{5{}K;|`JDd=;>uzLJnk_6W!d z!pUxaNqBg`Ff+%2HeX=;$r-={nWuW`g6M#Cz0C1PBRuf=1F}@i^LlF^Kd@I?g``(& z|4!G@oJQn7{cP}PVROUwD(cM*v0`Kv-xyo=s7c8-ldh8)FFz-;H##|8p42{U;B^-q zMpYF8oHR|gL=~aA;yE9BlqwOp^}rW@_oEdx%@H+M06(`&xKe{U?D0@5mRbcjCuk#|GpOB;Zcu2N!PIhcGL-cG*p^9#c`z#Wi#olDYaSozii%(TKvqku5v)Rw*-n~C8Thc&i5U>U% zP!=oiOM{9RS3>|Rk-V$EC57CF3&p>(-rCs^A`w5nuCA_n^HHpBJwR2G7YFDfcGuZ} z8eh^SO>yncOia@9GvdKD_sYYN^fM-G4bYjiMR86_18^T9RFMaMWQgeK5gpzX-fSaa zxneEEKj)T!SEu^RH>`lRsGr|YW^#Z|wuMUUfM8ri#iEz}g>)jso8dE_-S z09`$XG>r&A;=1#M(Md$YcsKl-Wba(p)!@)O)MP;P;7meEn3~pwj`BHi=u2P9Y$9E*on3dF*P-Dh&wwwn*bS_`qOn2l=!tN<)2!Mi!^TGPYU9 z`uGsvi{hVKq$tyfRnkCZjc1C*Y4UK%u9!Dd!R~70$SIZg>;FlBD5VdpbA_lETSNr=C&VU9ld% zJQFA>2(IX6_of)&Bpnhe(>)X>j$|clZzSavjT7x8qm*($!vaXL#bDOKb3K89Kwv8r zHq-!zT3=p1hVL2(=|}AXR*_mcU(p(%juH)kK{C&2N^wnoU(G>m?ilHAktgPD*pIn_ zQE2emVt@30D5ssCR}p1tz9A+i20z5qQB$q8p=1mx5A~r`&kW0TR57G`+g+9g<*d&J zQsPYkpsi7tEJ;SY*nDfP<}6oxAOHvuzNuZifjVMyrxVI5N%b?jP*E~7u)gAu8a z7yJ47IoP6Uhy@|0zHv@8vQqF=k5ISDo`!`*DNfwBYK$9G=Q~-_-AA1bE{Q3@DtkWj zz8fUiF@&EFr5!C_vR_Dz68qTR(*O;#kHn&mS$ZYZ`i`xP6aLcZ*@=fy^3f#LbE?Zn z8gxZ#H6Cy{SPsDJ9r{&RSoj~%u&H9LX6-SSZYwSZOq$#<>e7tbfjTT!Hmc6RM7-ZpsXYA3Z}L3IKj+}@oN zmR;zp^MO)v028Oiiew1ZA>v^@>aq@`3q& zYAr7Ipfc(*5J4kK<^xX$?_3n*@BA=HV7|&6*l8-49}xqCO`@1Uk7%Iu?Y!fgRGXS) z*B-=FYkSiH3>bzi?y3JG9@{;JI8s1Q?UNPIoXyaVm1cjp2SH19^_*#SjfckHSIN9ZKlsGl}XD4 zU2RcPLqV!v%3?IFIF)p`fQc>7q8X~LSFz{izFH_X^ebV<=Mo_Q0+fS>&)WVF$|~5s zmu(h6)@bp}kpq*4_PjiQSeO)H{$X?m3=^%AAl26qBE9c9qlc!GfVFPz!|#1=PR#H6 zeUnQfRkuzS?vNgKtl~vr^~v}2I*$ks8RcLrzJ<$B!m6{1eTliIoZ0T$VTJk3%AnId z4_4Fr0M1Xs?cFm-2B^p0M)9oBZyCjq9Q9?Ia zYlY?(VP?bzr$Q2e=f=~Qvu665jypx8tSt#=GYNC>RQ%`}Qv60FtN?eY-y#_cG=pJA z&uF$0{^0ClsKN($k68qe7*}vw*QxdPcZ>AZ0zPU}&{z`E+Q&%5)e;G<{;W!9XB+5H ziL3^bk2t6-T3(UrfpdpWU2qrrt$*&ceugS_{ zZofrjKn73#$HKoMno2bSp3=~s1k)DT+QR1N}tz=n2C_0BlW*?Hvz86dc)*&@w!)VN8dMP^S;iiZD zD8Z$0H$)YZ>6}fU{vG1b5WHXBh&m_lI5`$JIM|N`Ape$gbR{w&P54W1%2_Q(9-q-& z1EwQdUnfh~M@`d|)YNeHT&bX;_uw7%Oo%U_nvZY~=!-)X+-=cefJE&i@{jM@ zckHTeEY*VCe^aa(+i2`!ZG@IdAm%DGuog06$}(dKTG0&eC)o-Td5tf*98i#|u{3lx zriSu5ixGIX{(L+&^(R%O^)#d;hL94GPR!hT@50Gm!=}Wf*?! 
zZpljnndGMW`d%QN9me%rzV1+jZ3?SoXma2U25=6Zn4QUpKiB1SVTmYTie1O0COmcV z8Va>;;a~B5WOMH(689Z4aR)>LUHh+F$5O2OEwQqElykbK8tS9=aUY8+wmp!4#Py+M zoL3Ictlj)R!j6KyUuFWPR!p)3HL(mMrK!CDD{sgk>w@nxS6cZ(X3JKe_U>e7nAl82{^H^h%H@AN>9u(M?kdJ3dsZ zMz{!*1~x7WHdjC+e85AYcjHxw2MjrnyLDg{(%DYSsgB>=bxs{Ck(?akE!i?nj;M% z^D-ZXFWkb-`>0i_1W`@)lhADx3K{#WC zwr`D7Kfw%-FxO1t*U~rcvCKvqmy!pu*A_@K=OEC}HGe2yj^k3bqXwqU!aiP*Sal)pS9-ce6EmT=hnLgKboH(+zC& zwClR^RI_R4V~x!&&WxA7x*4eYHu8WiNfxX;v|XZ9?n0)!Qp!zEE`}|od{jzCJIj~y z2Pm8XHFCk7vUq)oDu<>hv;KVtYrj;R<>p#yNQ<<>TbKrwBIn>xv%X~9%O&Rc1dJb{ z^qt&4zkXGF09r0$fDaTC`qs1h^XD?!8*?@q*z};FM@7#>^3_l7wf?{xfJ&asw^)mH zVe7w6_WBKqO2sYV(~S=M$K;E%CocO;EP{5p^a(=z$V5PgLA18bnf7P$r8Q%dix7ZT z4@CPd#5i*1boDEb+Cn@?#VFw?jgjJ6FSQ20S}f6Idqt;>O}{S%nwAcYlU6vq7ZBsf z1C^PXLa!;+YAOSspKgpyY^z%o>Xv}ors9V9+Ohnk$t_7q$#F`Q z&B9dl^q*H-X47~sfNkgR9mGoL-dcy*s0pUC1Z%c5ItFZ(dMW%N!|aq+bmA;Kcrb@b zbzk{w{dZs@FkwWh-0Y>+@&BgPXG3r5wFsljacYxn$`f(FwK*5UUt?FW#Lxo9q?+(+ zpm#?WONppZp#5a0bk6H!wgjQLq=Y6~TVV>u&dv_(xF&18#0b*hcdbY}lnkO;;L#GL z;=AWS@JgoEzuIE6^I^xFnS=LY7oheniVuR%_t~WiL+c8&YjZW}=bhX`uV?hbTU2IV zw!?%;01tihU&%~NgUW0Its}#)P1mo+9Vk2jVM2|j!6PsBWG*qg^(q!>ar3D!8?C7I zH@>qQo8=R zg7c+M4em+T%amyDaadyZW?V@{tk*z=S-1ze$r58OYnC=_Qp|Tc9Y@lbv;2cpZ}o%v zBiS9dL4N?+shX*o*}seF{vU90HOQf&X__^FYmS5G!u&-WurjI^d)wL-yFh_+v*sF^ z*GKtZQk(I->zHJWk*Eaf-yhpfDAfghrpQDC9$m!eZ!Hnf zNrj?-KCGpN_3;yVI7S4RW6!ZM2DJ?JznJy5*O&q+GFz<}a#*!XXbE4%g+M9P`p1MM zq!-~e}b1Vt7qSkY=K0p1uUvF!JtX;kZrFDnZGa zd+~gyVg4%9Qp`9Z!J5s9Wz4Sp+eNy>7eF+D7LtJ=dASPh5t$<2qk-~wD8n!;yRvXWtF8&C8%+~afDyThdxl%Zi*yW z{(>oKaMl51u??BrmHr>2yT{Y8YFv-*WnKpX-n<6r;03)LGqKg))d}3JEwyp+?8RNo zMoV>P3Wj}bi+248wXy@x0u38~lqRHzsIzH}lSFOd0#GO#5nn^Pq3mwq#a{o#goun@_^GHE%Rb z)J9y|Jikak$O-*Pi~6IMV?C9>jt*~oM+XHXBXT1DIX64{w7XiiYslq{@cY2f3B;Jq8^T8L-)}OQl+fH)3q53q*s}Hi zx6NV!F^dm%I6Fxy3N0(-`nuKGegj1T!<&-YgFDy8@f2(elK9hMc3{oR*Xq%itBPkI ziw5Cs4Y6BAcu1&&4e6R)UB&G#hcjOLnws^ii9wP0-By2}mNOFcZQIgyz0>i#jW^J5 zFMN1TUY{rUvmrmtOd>JwUlJ=B3&p)h!PkMqApJ5SYsAuRj2Pvu#v|_b6!q=RzR-V9 zCtsw3NQ&;7?|A6f;hFQ<%bKFX!HxNwZuBVLZRcudY^q;8-_ncBL$K~`7RsqwnRBUm z=H-}_WG&xuYT7Mc|11=LOUzeKHk?2*Ac-Li2V_R~h0t-9iFqmc1@KRAv6zJGxa^x2 z#6^#^mZPjo1@>hKffTuw^3<=*!y{y!3Lcye@ls4tega0iQJ(?JB#o zid>FWFHE)Pod2z1F%pU1?^O(Z0?<@R zjrdg8`f^-tKPrBj6yx#SZQC47W{CPcsk^L!=&#G?T_dDf+kHVNd1}ezKJ1ZTaGyGamj;jP?%vZveL|0R1)>DMZ0e=8(A#Qk&j2?WTW;g-2ozYS&@(qk~Ku;NRI zzUa3gi-?HaD3*tZha-j5Gn*44kZig$quuLLI6rHn9)NZ2$I1S;LFj)Aw|C4Mpi?nT}7-l!$Kh7lL z$Mx7o0(r+@;dpb2@0IxXZS6qu!mAkrnQK6zHp)hSaa7A6(VWJ(NrN3*T?#EjyyC0PIO0lOVt;gV!K)Wu)G zhJSNOo>8Zo80L6C_bb%;YdTl<(7DzW2`DIN`BRCvEpYheKYBRriA>kp@*eOm6+%Qb zr||oyaHfkd55rt;W_09->Z*N*iSleuDSdZORRD{zhNLJH`cIo3H%95ocum&g$i|P3 zsN)}FOEp1kh4E9>C+mw-j>iU1UNwd7ksJL&b|MQ=`#PLKx8@BFKhD1QiU=w>JZQwx z*)Q4ngzUy*i?GoDM|X^=0IOAc+kq|zKEORWYyTPs_6l6J84O9&#VV>h4mi`Plv+Ew z#ixtI0!94Qz6e>DaLD+GDnVNSVE-uQ3k;;Zl9jBX?1$RK{W#kiV8DcAoMkUXN=$!uxcrT#)Up11{<79eZ>@)`}OaK?*KQ*7wsFT~><}LF4O-zEF))D%T4g zM1f>6wUQ6Tc0C}kqQs^>Doa-bJ|2gSM7ztApXJpELy5->`3e?Jxrz*cDL3tXaIq7u z#U#zV9NZxxY{!c+zfFBTZtVH#aSJFjJO@9LeiC~jR3{Q8{}9w7lddcpJc{~m!0KqF ze|vIHjV*=^frsKB8q@IH@*Yu!XT{(-PbeT|?EJBqfE%OEO6FjeV-z1&ZBZ2)V;j^_qt}_hn?`>?juxaP;ZN! 
zOI^KCoWao4n-vTSx*IxV6Auf#uE-TCFHcK&ob#x5gH584+l@LcSy6luLS?X_O|$Ni zb{3q`&9fH`Ye_$s->f^LLAx`Io}OM~zoXB+m+GXK=8vGm|B6f1#1s`#iAhMRnPZ)r z4dIj>u2ZR38^coRm11X-tczu)yV*&1&=ei^?joUzOdZs+Tu`j+gQ8|4=Raxs3NGti z-nt@YSjf+X8`nn`d?}vk`9wjS;miWO;?tz=^w>ZYFjHdQWg@F#C11BLyL?Q3@czsg zZUVhGkvj zLT@cq3d&1~!ZKf&D&5|>n(SHywxGu9+vWf)7#wfjxp9n6F8VCQJHs6Q=-l@y1b?my z7NOQ5komSbnaDm{E<5pZ?pDjy;~K2B!a|UQ?@VD1#aHF}8eSJ2scHUfkhL4)4zYnU zw8KQV^ChL&E7tapJWVztnF#`04||YobAKtjSLbzK^VKj?4i+^FLAdD|0Rg$V3Xz?2t7?r+7-Xv{oOL|50;I>X|=}59PmT`snkoZ&tSUIkadUk?< z^}71pRsvh6OteKW)<}TYi38AsG?D-)yeb-O)x1mtnIoUJCZm#0Sm?R>IsC3f46}FT z>={j2lQ@|~cWY#z) z5^ug=XOdh3S$gpLP|cm@B;Wq}*c!>ZFR~3`)U|Mvlc+W4#ky|Kf=Og#;Za#QM`sBA zLX^oA?L1nHL#eG{7J+Pp8ih@?WO+%J!(F{kSuG4Ca%u_|>q^Z@21Gd$DC!w|Vqy?l z99q|pU)}re=6>R9d%^c?FwiM6*`=~(Ujx;J;+YVXd@j80QE*}z3uRK|sVHrKBzTrs zFqx-;JbmYdkD8={1(h;NV*m8sW0V}9@KY}d05n~LCFSm)yd|RVC>|=k>7~%!d}*Xz zJ^G#f7t=>R=`i?~?kU4+cb9^S8{LE=4H7#dWdXJ3^wfE~pM6+VT8ghRl+fp4vX||K zS#tcYkC#0gv6Bh0Na_9fk1@)MuV~qXxV5%Z7!>QI&NpWvnL0mSf}UK>%3s%2U55YM zm!fo!5dLx*vQW|PX`y>ncF>mZqys^#mtJq`1pj-i+U?jplfzlikjiKO%EfR+>{E4d z>y2;zA39Ance#$2rg>e21<1G-+I0{k6+EQGWOAHpToF%^{d$)> z4J$fL5xPZiJ(RLz#+M{%28C6;@Yw1XiXiV+VC zY@+aWR)nIU4hE-EmghGyariJ5Fa=c}g2}%>zN+^`c7!~AXWahptHx#DGkg0`cdh-} zD#BDSOg+1PzHpbMh)PjYSA>I7+Jw{JX*oV0oX0zx^R+ z?n)dQ`#^8sr+-B$<11;o`1i~9smp?gg=9I%M%8D9$sVlFQ90il-?4y=*$Q|6{Jy?bmmD+WEBL9bk8X2^N9~^THvC z6d@L)!CB<%%^NV>6wmlv^j}q^PTNZ|)PDqaG>Iq%!ky;}?eSbj`Hhx0Q^L z1plDtbdeTf!L?*cMn+LKxy$*MHjb8f(^ObO;e+NJeDTJL@1|I&-}W;Wzsv8i2)NHX z9)gNv;LTK`<`|wUMOX_}@@m`>~>mK4JPi z;TW-veH3X$f>R_0(X>Rg$d)z!ExnsMWX%9UzF;MnB&)BkvU3jt5QU_(?K}CB$?N0m zz%v7!#9{+br8XpIa)>E$$M^!ui4~8`MY=49Tj^Aum@u6^cJccIZv%C3T7r@L;7p@d z!Z9|fYjUdBEjSG=yC~tE*Iu?2yU)3u*R7NAZp6ghJN;&Dyi3VY@iMx@LNYzQo{h5C zaGiYX>T64Ft7ZJk4mrH=IT?V~K`6tYRJ-K>gEe~$6i{q}WEDwy2A94&WbvQB*yLl2 zzr*1?zD%Eox{H{=tqQ^pFHaBWJ{jT+E|Vpu1yi_xwSzKU5w@fu-s7XHGC?UL6+7{6 zXVJq|0(Ctp9z0Vt0@J4B15z(`p7=f`^DwuRmX`XUxz@`3oV4j)#@TyYPjAU!z^o~& zPY`FbjnFR~23~-bnJGtlQjXycv*3@b;u^M*RtR8is!OMv`>|N9#mvJfG>8}XhDsG1 zKnX2|VQyH#A;M6(arN2}%r(WT>gOizD2dC`DTVOz7F4|}Kp9z{pbHDkaYj8_D`+MP z+TU@&w^ie?YiLx`TJ4;I)?bh zMm!JLNXV(>#8QOW2I(p=Qn2xE*{l&!Cz&5L=#2SOnp>Ka}Uuhi;EDKsCiWl1>vNv<`wgOXrw?b7G)J^j>zpe!bgw z8^~s534E$jtWNOaY`{#mB{}QsQ~*P*Nl3XOyrXCYR`x5$yFL*OisH<5P+WE%l!W0> za`7Uh15BrbU=s;Y_JYlBgyKu&zwvkHTbBjrC3fu>*S zHwJ7bSK>j@&G&^v-m4_A2^uA;gguyVYoc>Gz>(t%-iVYq%BiV~W8s&%xw$5E?YSY_ zh@{r?%6e~S1?dT`TopKeNvUL6+Ry@g_J~JY3O97LZ zIJhaTi)9U?kI8e}r-A)kvA4mE5Et8a7fg4_bc4w%Uj>0{*$Yp)D=~I3MMQ}-xg91{ zkJ~7S!gg0kvhH8o2Lkx7o!F^1>EtUba-V4CGu9c!-erd(>&d7Ill^K>0H%qgrP`_i z&d%)p{UgyH*@*pnKtFyz5Zz5LB}`lmhKGx`6sR)O%U54}Rto=nY8ZN`Gv}H!i$qf4 z5mvZ-lD3utVhG<56|euK{Uy|yKQW+@M*;-=V=Pye;`o6Lip7VI!o>VpYtaS1Q zXM;6BD=J|wp%4vAlqjmDD{amKKRH&m3{+K>FmeKsHLg>8R;oW9N1u%=Sk(X|a8H=7 zo}LNNajo0#Y|;WDNVT?oyiEmRig@u(Yhg1~c50i+oq@eju1evI6j7u4fBfa1PIpzO zhh@7vyQ2MbHSR%OLhrx;a)te1$E-L$lwcu=c>}t8CnAc-G_v#2EALp=jgBG&@=9*{ z2TY{R`ooX_&a&nL*_EM25{aUvCGOo3ci-nb1eaNAIe(aP&{N zQDeM7UGCcu^87@uN6)-yzgqmyW?15N>BCcH?b5BZ&~(F{AZ~x$>pUij$GHRNnOR6R zvLbMeLC+KXTYZz7rAXDR==*z+JP-~Ztq;f-(hC$YQ!T-S)2siIsK|d3BPx@9IBtBm z8rxGo*YTctV%$@no3}+dE^ZK&gKSVAYIit)GSjK2T+E;caN3frSIrzoo4+V%dc{V8 zDV{Aq{_C>26Y-mG&9bwTdnE;ki;KH9SjmY@A5ylqwnB#$zF>E_M~=|UNoYvzZry9g zSl(&MWU*1BaSQtId(PoS@Xk;Lno0-`%+IHS=VdRh81FnEFE|YipUK9m1EK+{2RcBs zEv`sM%Ra=b1=`GS&rB*ud4N8kcXac>izX>8jT(t%YyV|v$Zzdi742tk^8H**>2jn3 z>lpqDueF(`xAL9Z)VuC-U@I#tzz@d5p^FbAUD%3EASnges+=KDWTJ>#h16y6^=l1pvJ5m8Ar0}EqAv+1!AeEj)IXMp9 z{axnMPY1TJzz>S(;!-*#fIC}}X*hWrKlq+p^{J{+gvOOry0DhQST?qL9tmf6k~ma_ zHB`YfMj1x{&#tmymg>I0&lVZPYq#~h>XZ90g3{v@kou_hW*l7zZJ>tgHf)L`*3gn* 
z!^8@zIz(F}d0HX!nR{)4;k^i8RPl`?71o!Q`fIZCcD$2CravCeo^b_Eyd3FnCsl=9 z@uMwP5e)hB^4<^Vm@ViA&4I(;2I8$DZ`k`om(GjXK<3+iDKa^%wb{b*I0n@;WZ*!b z(o(s2_FSb`%z;!YmRSSY?Gh?wb`V2>QtW(wJj(uLMESTkgL`6!lqD!%8*OR*Idbr& zaj;>5s8sG>#eyRC9Tk;S)Z2`52{DR!={Wpgx8_d9VsF~)PXll750_W12Bp08(h(XU zfe{k{18w#p--2OICbAHoRJ0qk^)=8mQL1GCO5k%_*!9xJ^L3j~%Q-NVOuK^v>G(-J zwr2KkN^>|G@<_{wa^ZV0aQ(wkA;3z$#%tu{1INWTYKst`NH39{gIw*;9vT{zB##~JOD2_qDRDvUy`0_D?h#6 z{fTJ~#oQX{DS}YU1JiF>|06`;kncsPTd7pRR4tQaBjHG@Ee^R@$WR^-pxPrNVzSNx z^7+M2YHDhjFbMxGOIF0sEg8}_7W^&Ga7%_)y^ZotZ;X!j#U%tPjaV*2RIZ5xwy7&3+Ww5Hl4N-)wUD!O>h)QBp@DFTm zi>4}+p9_|^ygQ#~BzB^&2n!!Y5 zPhP^LcO`HjhH;yS5)RTJtvlN%t3I%^?w3>a#F?02rzX{f5!5kS$^3P#ml_woqxO~j zDnrcLQ5X-g5&B>q2c`qcW6Hbh-Q-QT-YY^gR$q^2mQc1q<) zu*au?FM_VFuBC5}c{=bJ)O>k5<-Q|z6qUamM)}!fgl^78%BoU0jyvxOPoMwNOt!R@ z_O0kmcRUQwmkk+IB^>8IbIuEW23xAr_Qb+$gk!mQ#630~Y}F37di{=8_Bxkqk^J{H z#m3ai1DFHr0*!yDQ7a~uqOj@7HXB&kt#!BW`kgAD7_T_k`&+MO+0Thtm;gt8s?F7K z7xy0=@eln20~UDC=b1gSsr^8RcS9qSF~w#u$+0Ex;8dUuZZJljnZfZu)EH5c?%sdZ zCA7JILhXD2ZLO=y^1_WRKh*)V`6t;>$1VRU__D2T-hL!&)2}RciJ<^uT!$NN$rDU` zal_po&jR1>UspP1zPdY}IosXIPFJhXNHcFp$vN40xWVbbsl@Q(mW_6$(YEQ*uDmo_ zDt~4Oo6v)@Hz=xA&}m(0mxqi1MtYMJl$6hAozD>9t!N8nLK;^|!F0m7iatMRhV8%s zc?~9z^C_K&C}oFtVcXPwdYUG*nPMP)Ppuy$YIE^7I-ALXXDL~+)BOMom(7rQoJ;4i zlfl7yarpiV!yA>{HBiiR>Rw?YrwolJwcwnNx+)^}rN1CZq=6LQyo8z0a%U_AOcituljt~=bdU<{_9mo#{ea3`4iC~u=dO)v5XV`?&Y%4f&OG|+DVgE1caS}Nz2ZycBH|3(&8^~&+KlG&%*oAB){uVM$f`LuB^G+gcD6OENK*$N0$c&6lNP%kYkBNP> zsAp|qgPMgJ8{HVvycqoaXyuzI9k7%HAI1WWG60IV(&mReODOql*!4I7w|T{2Y=vTo z%&^8x|Bdbi$`g+0!NJLCpeNozjS zXJ^yYks@w_83ZELXNWF0&Q40A8wInsmB&5=GR>IHZDI2I zPk*(&e0_DIY#*irpRr@RjlzP>HslpppIAcWn&p$XzTX4BvT`K#ad%jF{29Ej`0vY_ zH__z*)Wow9hi0eaNDfIP)a>VKE&h+ESw>#-w88mFL6S`cxE|A~N~K~3?pJk>dKEa- z7pYh7ls*DE8F2@&n5{y2^o`n9#$S4V4b*OQpL(c6=#`sjZDD+Q^9SPCm>6&{TJ!u} zR|h7&m_e^b7EU&Lo8I%juK`8p?UamgIR{DUg zgqOLR^a`#YVVgLuP|7L)fMc{A6x6B$HkqolEQ-1$5aAF88?jw3Ea1+iafQU)Uz*hm#@s3MThj55HY zr>(PwAAaDDUh3<^|FHCVk{FG}ezrPOvP^$6Vs8l(24qV)3;E&1gIW*3xN=v`EV5#Zx2%nHB>g>qMKZeHbEZfzrXwgb=KnoRlp2H~W|J7gbboayD-L6zihs zKbR_e?SKjoDWNf5hf+1x4lk=vU5LaOoG?vH3MkZ?{8BbQK~Dw#8>iv*CU+B?_sjhT zhLArvzXh68hjN=5hkU*DdN-z}*Rd`re_7`P4~L!19IKX-x>{v?a&j<>!>p9Q0+E?~ zf-juk%EGk9iuqJ@b!(RKzcea_(AgWH*=2}Y-@@2czf%&p6npUQtZijP1tljZgLxYp z)Ku8lNBt3c@LmiZG~l!M%uc8Z)JB?t4NBp^v%D@o(v`RNO)c)2yi*Lk%W|EqtNler zH4wiXn`}s%&`N$?OkUSORZz0BQM#XBjVLVeC7yZGObB&%(XYaxP|ctsE&%PO-ExHb zKN+CR{HHwowr^rbwOi2NQGqj4Q&TObFFOfj3#_$1HPQaZowlek>?RCFz^JQgm!Geu znup-b7Ypy8oD|CM;WlROi*cR93;D;ZMG|Mw>f47wAMaA|HVV^b=(|e-bH)k+4 ztKK~-q89iE$k5#jNdHHM2L91$)}XA22Py&&=s`PM)m`>gYfjb6&YE4TIU-QDYe4Ke z>ql{RR&`Q*a<9RpQ>l7cueNp8dfje#fbJuYK(+T#+@-N-nOfEKpRpX)){6qkcKqKg z=2Z=s3H{l^F#vIO9w~U4$0d0!N8`M)vrY-o@C?gd49H< zzHCc9?-*}HbPU395)I7*TDUOKkA zYVJzvW>g;gl|n;ui;bc9Nq9`lTrIi(zOU;?h3Rde)Tgd0%XS_l?N z-PDeO;(FUIsM>#M($bj1AD&emX`a_nwwt>vg{L$Yc@t}z(LX2=UYg#Q#S=ijCMhe) z;tUrCQ=aT1Qb{KQFUV4MQ^$idM)yQ$j!15Y(9S1lR^LP=tTe5iL>z|UDf3kYcKw1t zLb{l_77b>rB>K@ofwQ*$41HbK3x-ZPD*21P`zW~KuIv_q-Eq>>1F#QQU>Zgz)J5R{ zT6igLBl-NCQyJC1t!m!rYsroFHnvi;snk9)Dk1oD0NQVZI58p!u;zLAKChK6#ftfl zf(IuqE)UM(N86{i&mOs^+YH(D#l}z56|rd}DC>ig3^DLihlWi%-Z;X1v|Ch~p`HCs zt!90bib#kkNTb=iVp$G zSoa>)Yl$Y+`1>A9vj2Fv0NtxQ;ny=-W7Y8s{Ei(%UlKxWC$+_{B3${yws{;~M}g`F z4-aSUS+27`g`Z)JF()$%b}+zB1>kd3Z&9Mw2?z*GXGH~5qdQhFyDhHUKQ+rgd{q0u zc}!C6S(xP=v*9jfNP5Hh+<3y)p4_)xY}_v+9TW|IMrVg*K@GkKh~X0#9SPGmKIciJ zlDgeC+yVmb3#I?$o-lz`0NBG900#Aftz~&)?T6(;-y2uAtoxP-w~w73XiZkr86q%J zGZEjvL*Nh)xWq?|;&CH`$mfCdutjIsbaTJM@~C4i&V3$Qy`MrD*iDo0qt5H|Q}-Wf z&f#5znttrmBft4!cYAx+as#nO+Q^k4(oZ6dp${w=-_SGM-JnagM%HJ!{X8ZKC_tJx 
zM9vG3Wl=G5{e%8RVj*PT_{#>rqxC%r0n`<_CuRtL5p z+Q~w6PBK<)em{V1=BVQp;ArP@&h*#+83NEenTtOgGG^bPdrBCLa(-P1aLpU=PEEgzI&Cz8yk zdiMI>C6(5G$C{6^e*uJ168Su2Vg75DH#95lrK?Wwi4E1;&&TdKAl9~x94uA;&cyHx z$Y3?}bl`Fetl>MrK_Ip5?R>}7QHW!a8UbRxrJYYwi&~NjzQLM!oJ!LMBytPAGjHB) zqy0M=C(|JiZ<5%qZpWlvUF&dbu$&~E#O_~9eG%byZaz%y`X7jzs7<8wS`g~>W^mn+b`5`tcUHChRg@qLD?v;pAe^QfKop6g~KcUcA zQ0q&CkpB##l1UiZ@THxnCT2O8q?m`Wm#sp*SAP->6xx3UTgK6>qZ%YSE}gJ62G+Ky zt)9jJXODZIdwOP;v|lX^&Em!;&*QrKvl2kSciIR2 zi=RIRBA~Sp$|RERj#S(PQCK+`z`;e+$~FPLOG;7zW-lel)5U$Z^dNuXf1V-{=j$NW zB1}omX;^J<f+t1B{b21r;5J9svS~(-ZyE5P~Cy zUhmcqhQb+G=RgNxPLy+Jc3YUeS1;z$eCdF8JpQWbz4{#Bd&B{OE=2`0iv3|VaO)(! zf)M=VURw*x37|-J*0l3g!A)grSmvuhhDnDB2>kL7>ON}YI;M`x%E)j*YNIW%YLnlo z1210e>{cNser3FZ47(Vq@>q8FNmbfeLK}njm8E##ZKl@uvHi+{I2d@$L>IiKDqEz{ z-i$B|=d%+LQaey0T!F6tH$q(`ZOI5eD6!n%pF7CcFRYHXG!~<+UXYp9 z;o@rYWz6AfBZfnD5O#5up`Tlx5+@^4Sh;eMS_xn~k~NBk31Mg=jrjhSDM;67#j`c+ zZ*I%ks}>>2G;erd@M%~+&RE$KCHB0Gze1u|3g1(_xe}fyTX!i|@d0>$O3JJ#Q`Y%} za;yB7LX~I|T-1Pj{)PGR?vKOVHShNnRbPA}PwPTlXO47=!l$~=F^WG8>IpV*{h6A2 zY@ZlOIs8i-i}~V#Sp_1Da7?rMm@GT&B1KYaYA8^OIW99*AespD@@U@hZP>xVeTO&$ z+5M5Jsfh4z@EM*T(CyyZF^_+U2(1cj9AX%#*xW#QjyKwSqI|-yNA{pXh3F+q5ZQi@ z0)b&3Qy=pQC^>(_ta%oIr0%*RFfxqF-@Gj|M|Kmv4(KGrV`CkX#OR*x1PU~X4LGT0 zOkd%hVcJW0Mt(3Rhbt>sL`1Q`??EA(ock702^y0W#U98&>iBF{TVEG>;32+V*hNra zrudWPnmvKAMppWg6df@>-xA ztO(wjUnNq6%)`g6g+3$`ZIih~3y2}mL`!5Lf=uLTpH{o|NeUZPUbqVL0&SO9sVi{M zH>=DB1o#SU{yC<%a3n8rMkr+}axL=?pVIV<4^86pzO5Ghz(Q5Ndo*goS&I~*n5p>C z^YS<9V`tqc^x?|)jZt78wGvU?&e8U$33eQUP9Gwe7dVTqepV|f^~iAkx}jOzEv7yK zTyz+?XGi?xdrCw^lyC9BoqTJQ>s0pt$i@lu7r3Pm<2+Y|mDfcV=-6{G2>q?9rhXXLpq2Ssp$Xx7#|we@FBXP6;9u=ZS|L50eDz!dBH@a?%@q3Z=y9Vmn@4`jVw3L6NNT|^@STJpfL9sF* z7WEN;$V`akaI{jy=W&E5R~+rzZ&NoTi!E3i9{z$Y_usAU9SAcR9;d+icbf?yt69rq zDTy&EdIBdx@)ne^Y^%PWmoUcU=BOwLHq@Hz^bsOi9R24nA&ce?$kfS+jr;gDq6-|! 
zIit%X!bSf^i#f=F)gB>K^MsQ^Epp&wIspNw92f?|6-B-h-qD6>GS%Vu>%Di^={NM1 zymo!yvponIVbO!*qdW%zkD!GGrEWG*jT0BQ>2#ckCdgwA3})VZVS-vJwr4CwiVX<< zwf4t%jwJ}jO$=f$xqH}nNdS}`+tt--9X22bgrwplTq&oa{9VE6?4*ylTpy+slwcJ2 z#Ql6X9cU|$E7A$*VL?er&Pq^MO4`U5-dis8KXq?_15P0t<>u*W;>3hv``53(odO>| zG+oUsE#U&7$==Bv9No>YtG>euY~5k-PHeqY!Y9|B8q3y2^21@Hxwq5(z@#5``HN9z zpkOg^1N%XV(bG93$tI`_e4$zF&1ebS&Q|-wQOc&d)p!)T=cgEhY>LgpA%Ivgw?OZa3|DDsVNoWxoeI1jldMgP{QfSn=v9`L(MTPvQ9hM( ze`IQ>1&$%caEz4oP8%X|B!ak%>GJECeP{qgh8$Pvb*Gmm=jGMGY7%{U?{8JL8NH@`d!XyaA%MPTh$e>|Ff?=hVeeYXZHT?k}TsAku_ zw$YB(yL0548P)98h=3;sK+$6g0`8-MX=iwzf)yNoFyZT#_`Ns^Rvzf#e(UJhX+ciT zQ+j>!3+12k`wyDa?>|bJMnf^{s=~io=1UN1Qq21Na39{|$}B8!sjE85U5|PliuJ4H zk=w2N-+W|`UW)2nR&56EaDX4?F5eid_TAkd{Kh z!1(iP(UOoH_=}%EiGWT$Y2fIdQz>P@0vt-%VS@l|gOZR`&;wOP_zGuW0I``6E-a(_ zmUTH(QA5C1qI}YYhQyepn-GvK(#%D8K)@!X58$=WfS%dk05{&Q7}uE_c#l#lCLkg* z0|IgCTuQgnW+uS*{gtX0qW_Cyn>bD^mHcck$4P?Y&(Y7eFS%igP;jN(;hy}7^lO^V zY>Ht50`$I86q9B{k&RehU*Wxo{~uXz6_rQVMQa9kcXx;2PS6*33+^7=J-EBOyAv!x za0{-10Kq-DyL6H7KYexTu_sF{fp@Nl(nJtlB ztS4$0yVm>6?iH9VeD*u$p`n;Ahn36bNywhdG6E81c)YG@C9}V%HCcb`P%TqnL(c^g zITI5TH}>~6oTYgP|FE8g?@{~fjYL5%bNWm%p3&~ zfG0-psnPP{qLX_j3Rvv&lF#Dz%uL=d7FBrkcJC)!;P(pigYxt9KkV)8oz(BI?6$-T zyUqysb@@iYw&NY< zgZ2KX;|4&B<$r+rT_lNYPuYYi}Ik#aNM^)RLKMB2F;bU|^^!tA6yl-lh6lj>Y>;Oc9U8^}+|0n8vr670IJ)}Xk?-CQ_D$_=oSLV^&ZonV|Y=YRU=++cKDrD@ajBE2zPs- z?EIrfBWIRjb8_T8Al28iLb=}YQrK`&3lp2Vf9elb>RfWi;nqXa`(*%r?Is#fpMpU^ zZ(w5y<&Fy&0GkdWQuO)uVKeTfcH6oN8W3PLC*wC}c@OOv@$boD0Y4!QxU9K#LxF<$ zw>lrl@<)YR4R{D4*22n6GAh)NGG)-e|KIMpQt4}QN;|=N_NX`vk)eG7hJ@U0Fw-0O z5vg-r8qgG^WFr#Hnnh!G{eXoX$_iF`zI;ux!{kEb{9O8XY0I$#0JCEv0N%dL_+zH1kj?@&^9Czou^-^Rn_{{gusVIT@9?`5 zJ5MPN0CiT6(3-7C;OV4N`nOd#j@S_+MP1h_D69Y6Q?b>8eh&itqVV*YB|d|wB~ z9S>YqZ7%SRXzQiz{u*lp*kn_qyW6w)s`)f{kPH%K4E+`9XNjRL9;o8Og?_v_^A1Rh zKWwF;qiYW{^v672SmvwM=hwk_>3mIC3i!?SR%>(U|3T}2HCdll8%oPLQy@pScFz=7 z5UuD#sL=5+Bu%3)F)BsB*zi2}DfCm7XG=Y8EAjZe+-l5H(NM-wkjKz8q!>CgOPi-3 zU<9KPLyW?+iqOcGHC&PF%0Og`8l(aJ)F$6PKuIr5Nh}!C%bi3!2$K|w5kWoB2UQow z*UK3z)hZr4JXO}T=1UbpZKng!h#fr@fL@n+7LxLR`zbs)5aoaic;eLRQ&ES5APv3t zbKFreG1Y8+YZ5_XtNUaXM`cFw_SNadJ?o4}0mZgNt&UI)NqhbAO%+TqGZOJhIL~3z z#0dtJFDQK(TO|%kr7ZJ>!`xM(qzq;>T_dgfU0O1SiO3YxJta1=xf$qTF>pc~;B7%3 zj}J@fhbmm%`QVKz+x7H^I+$n|o^Qh3a-Nf_Q7dYj_op?B4g0XQeeiK{K44RE^zO>a z>fSs#z*=QugK?eMXm_-=+=dH}kig#EhJIi0GdNRp?2__V?&Q_#t(ahGb{99?vAS*o zPZSLw_{KgsDR&94k{EH0=5~IGKoL3tA0lmZ?sgJnbrknqGMS)wD&t50hB&^wX!MxO zxGd9ewaB<|VIMm-G+HQ%mHI6a-Mtjw^ix&GkG9D!7KD7jLhX17+Vg?bLJ!a6u9(@`e%w_*S=^1Vsf z%LLjm|IR+ANMIs`5yLs~5!B-X7#JAVlryI_3wvXF`<~&4JmldZX9RhWqYor`VWInOmb z&eDfNZzjEx5`u;k7Q8PmHiP%u6y>)+-6EeIzX>QxQ6aI#;1)JMqREf0(%{6`HTdg% z@MREPVHGr_T99N4koB{Fu-tDz3G>AXqaHd%?ysaRn>8?M_8k3`98ASRP6Xg*nhRZdNEqJx0HL?P)X0i8U zm>9+AE-N@6EG8Cw4AyK}w+B2O3<^0Xh2$0JhEg6(;kYdAZ$sK0D2yT^CUI79dcPIRjEzc=L{48`vp6ZtW{w^%Sy)U~?u8Y>rSB`GY3gE$bVdfo{*zx%bwJ zw3N%SMEs`O!oq;h4?5iIf3M50?`LJ>qAj2-J6QeU{sM1v(RRp-!U}-ZwzFS+T(PtJ zF1dXxzT0_pUO`Obej9F(HhR0*pv{dMH8Q(945&eP{+W8N^tM5Byg`}{6_n0W7MO?(2^^~;jn=$q>M#fBL4c>NPqLxXSGkW?HVb6IXa ziDNH4M=i1`PG35U@Fn8$3@H+?~2dtFEdR+Xnqc4JumOz7x<)y3I1r=)grWt*zo`JVoBXq%vg;wg*f*yw;++kqW z7EdNt@3<{-Lg=H90(m+6xp!?1Ic$(bBV%!KdARZU&+6u>c((RoomrBq%p|YEwt_W0 zdH?wM6FS@rTI?5((|3ZAR!WP`4=GeyNa8G&+C z)h#Snnu7F!g9YpJ0S%BounZ+ zYEB1|m^AO<_>3A{M$#0n9G;N!oaI!RzxtII((TcEmLO*qxOv&jX@cJJ0 zfE89jOm~wncyCQ&E%RYyow)het=|tTg>U&?o_wm<6*r(qs z3aBh>Wf6J5*@`W23YvIGMzR9qyA}xS?F7}4z_Fw=vsHhfu}ovuIxnawN5EbktLJ_v zg~$r}D=sih@RU=297mlf7^@ACN9Vxs(PXnPWOwRHOC0iFXTMRQ*vR4@9TG-zgHg{G z;ZE#)bhTCa;%`HhlYEJc{SoCJ;iAY>Z%^z}E!HVuJISMjT}lsFN?5Sffb+R_vz+lN 
znZtXq!{hOK4v+GL$G17tP5|BAo8T;zrO(w=c~p)S!=nPR=NYFiQwS8q-$MHvRfqlb z_*M*b;>H67$ZK$cB=&D3on{;=hH z5_$l!U(CF6oCFta&A75}$Lx%>6#{lUYLiW@p3bo=C0)Au1C3X1fwFM&+JlfX&kbNO zO`eR!;mzd<<1S3hN3Mn!^^|^At0!5S{ESWMg791;-P}Zx$k#$Oq?$1g$YHQmgym6) z5&f%H@{`&jgAYYy%}A&IXZBgsdZ=DocEB}&MWUh7%XSv0Jz zVEMnSnAyVFC}LXySoO{~A)Q{x2zi_%nmlSRcU-q8p!)$Y!}yqdP{?@jpGA`RdI|-7 zIP?=FljfOIYz*0s{84eA2>3ub43#<2a{vb+Ys@k__x(%NG3Hl>&i=mr2HZyVK%fW! zi|N;-H~eWr%&C5M49@6pWBu6*JYtfzyjeMxUJ^l&U2G@k>zv^$DfG-qtM#Btk#xLe zMa7qzW+3_(2>gU}2C7@jAO2lkIUFP`L{=qpoJ%!CpU@imVg#fK#rK4QA6OI8_SJAV z%Bkmmjan%j{GJQvx@3z09YN`&RfOlidHliYw^>1JJAzzUi4}!3L|RRY5kHisp+-If z<98P#oXifHS#T$nYI=E+HpvA=uLBwdvOtIW_eyq-T&b(ldfBUuOCsNwR5*dNNdH3Y zHXWrH|NF6vbe9?o&i?M6X-X1D8P}6Ex~*29G~=~PbvPbx36TQ861`E|M5Sws?CMwo z8vo41zd#IeQkh&SU3fCl{~%c75d4Q4=H$enXw(fc)OAGyIMYJQ*fZh( zN`7mA_uC^2)sj4q*ibX)O^SOut7kt+#fRg&{`v?PeFyHde{b|6cC>L5PWPI1PHbt7 z0*2z2N*PT$gNSs4y0K2A)1p5Zf*@BqzgLxv_8qeBZ9}-XtMtqB+FGlGf<}(lw#d_0 zl`+PX?_>T&%Yi#vb%E7Qfgx+m^^GKfT5vdMyH-`TmE&)Y_!TvHcopu8-piHTdjk-u zdh=HMle@;%b#Z`zEFT8qk%r48Lr%4SI4@dQl|rd2(DE>|O;95L6RDvTKd1}?b;~Ta zmnN;Z#G4geY+r`G`vA>%(~xR>q>|j~^8DrJKuZnt};@y zi7T-z%v;l|Gk$X(lx}C~7cX&cgFI5CA};>-kbvx5EPs z$hB*$I=#&_w6u%S-*uk;VOU>(QbNY*YLBnISg674SL*x+lFQ9)mS%b1EC8yBbEb); z_CNCJxle;Q$#PlhlK0=LogfDAjf=snFpsp6N+JBoDUDukIM)2Auag$Nei;^T9VnQo zlhn8MO=uiOC@sOVprFMgdyg#c(*hb;-HX1(Kkw7PZ{JvReQ&koiELkr?-%T)!VyJw zl48@@8@OK=yOC=#ZRcz=5E!dTMEyxwI-db02}gDWV``=y+1bH4`z7|ejqgtZD zI{P|5A02k5%UK^8aGe@Ni3`W6iOK5N9JLxTfrAySasUpAOR2yC1L%qoGZpI9l%P>* z!CU}M9P*G(!;Ij>(4|5)q{iwXhLP-X#Wt#m5o7Zos1VNZjq*>j=n~G2#{;&5tko>B z{&U%yt2zTOD~g`XR=~c?{vd>x$$vQoykH^_?K>_cYQ_bGo{rgl04WC>ctO|KC$|dC z#5DDa(%DrAC07|B!!A**q`io2?pmtK3h0Z@XL`b;p8y?GF;;OtP5DeYU{jFdS_4K@ ze#z9#-|bcdb8-d0o-XFNrB}WFp`}jJ8yr_{lHr;7JTBlW-L$b`*l{(MvmSj0Y&nlt5U#9&p_%D` zs#wDPi@W7nu7BMup(&(KC9;gex z)E^hGl?iEU+}|LbK*5v;CU2?^wFnv1QcZxJw#a?IH;{WkR;#5=19b2?-&%or;j_u~ zmJU$2MUjyI0_@p7mH&I3mlhX?N6@Uv>1h{UXorW%#6pJ;vdQndMQtv|+1oK{FP*H( z=Kxp~0s^89;sF3klW}@1Vq*Po!3zaVwf84;67uqx|5zy^#22}b8@e9o9NaitUbi?y z{hz)eLO@|5`Fo6IW)JvThiw>5?tvN=EGH1&)eSKy0OIN1{{W@lr&UJ)9EpMd^3JFB z{>P^pMS26O8a%xB25Suh(09u8>!@szw0doEM0jswrou*(2XyI20w0}~cUhz>KOI!{K{q1)i2cMcdMc8?hu(`b$x@y_SeM|xeGwW=RaF&eOt{=nqd+-*Ezmt8{^80F z8n}e9e6^r#P7@Fia5&!(c}VAqphgcK1E-#GLF>3X@_XJU$vnv3=8FxgrOtFi5|axx z{5Eg=cR?IZn3RzWbB1~bw7lGGodHtq*#v~hih$uk6^%nS(9)NN?Eofa2fw5{8*x=p zgR=kds&m!-2i-yWocF>wxVqLfFm@?>LjtN=KnlVo#m2-`18WbhSI(ZXx4&O@5kdVW zS6~SPAV8Y!A%&LmVSlB`5lHRjM^wlWMJbdC&+m}oqR7*cx#t>i!TED*VopQI5 zpe>-)bCoY&6xU{yrKRBvd@N!S!(DT9r#&+xb|KkO~Gd@1vl=RSnhdRBMQ#Ufs{tW~*Rak|p{B|u~czC!B5g?q-S&nN!$F@vyoIF7g z{|!nenImgj)x?JZE9evil%={(lv^**jZhrCo_nP?E&3kD^sUWy*WXF&WGOx5%gQ2drbfbT6_W={N;C9_9&3rCM~^y6@Pl z{-R=9&fjkG`Lrs5K}m@XyLZ9!r^j+_2z^i*_7$qP_tOuAy<}*~4gFybv|{0KvX?*i zYZ3xH!TYgDg2K9nKg8eRWd7@wiOCjDIJ; zR3c>Do~-%!ybk6yc#4R0sZ4a!{TX$gM-HeQZmN@s+;E~G=uS!P38rq&>+U98?JSc@ zQUEk-tQnvAJs#*J^5@A7msjx`MbkDi^bg&c5XYV=GjT;y62JlXxxL*4rJqh*y^CqT zm1HPY*wmZAPh+S?emXEZ4(h$s)vo*?-h-47hIuE`e%$F&+>v~-9rk-Qfqp;V<;>nU zu%eZtWTdJmitD)dB|xHG)rA_l^c)P?kf%++Ht_?SnYFvvB#|8o?5_~Hrp>xxvF~XIRde`YM$ven@uvfog#ku$d{qSlc)RH`Df#qJ z60e1zygA7<_dvmLiU*@%SE(j2*AgpzNiC4x$9TVe`C%84b7IQHb6G~DdlUVaaD~c+ z7F6!AyEdY?`oNczog+U~hcCzQJM3zsp;=+$Zfqg$Z`z#~z+9$;9cb#cX~#+DW%n`D zBT(c=3EK`gsg_07`#hLsxURGoIJgUqUZLudW^J^8G}V5baH;NU(Qf&2I9#Dw?<9iJ z6klZsxr_t6o15z{_SRY`AOTR1sN0ip>`nKhvg64?WCgbBA2d;>qN@1>D^ZLIp$!4M zRr-*kB8nC}`GI~EKYt7w>~qQ%Gem=u8lWr5-R|<2{Om=)5_xU&^fWf_D^Ik@*Q?V} z3LvZEN5OB!v4DF$#s6+%H%PUIG-XKkbe#(C%fZ@T4Ie#?{5z#R?=Y$5c_aC$29;Fp zcH`(Q)YA|Es6Uev!SC&9y((t9JbCaZ;Ai4$y8Qa`0PArg5E&(F^?Jd=W&er~5IKTV 
zVwr!fRRB^@_kuTgFEmlIng2fO!l3>*#cbl2c5Nz}<)3#EUdzuk@86SX<*{-Y1x+}* zTK+mE->1jQaTb2o{_L+J9%8=wDl(ay?G~#$=hn@}cC!A~C8yl{5pd`9dwX4Mw#FHs zn3$PZ)?<%gVth83<4U%!LIc}ebO&&>Rv;pKZwU$kTgsw$K@8e@)utm>QwYI|29NRT=Xs#bwpR zbvKd)S`+~|B9iz$vlbQ>rpwJi#n*Ad0z%#z{iK*_sg^FY| z;iN*8aqByC=RviPwX-@R_Wk2({mpg_&($(s?r-ifDaR69BH%Ey{0Qs(w*2W`wRg7H z6%{T-p(Of7$j7r0rPw8WcF|cPO_O9bag~B*x(uX4&@(8?GR)qCp8l-Ah9)N`!}mfB zP~N^l`$;JCli1`AIn|_)1AP3_($ZxYkiNldx)yr3=8Xb0L{s)QLGO3_5iF)*Kf7DK zw>iXU1)Hx2)9a?dupX%arPntz^~!j;sxA8KtDX zKSxnfO_z5@l6Gqb27GMRYh&s80WgB_lUMRSSxUfyoE7kA09Wc+BL%SFdtJLf4Ldt8 zpDOB*UXwE!m~ZiYf3BZQ#y^=t0(da@nui#<S2VruAK zb?D~?pB<4fPgoZ2EZsil}U$(&!S&srJM>->6#2hi`dvOw!57ym)|&T z0>`ImK2(S7F2oth*-d^c*uG@yr|O;z7RB(0`|AS}sYXJ`IKkAzAOGa1@Cx|?46&u= z2l8PItK%FMDXB+95hK652Fhkg#`Ym2ZetyBtxv#6G*x20f^7l>GoLbVFic; zHnJQZ3q7p=GI0kKPDXu@69S<72h1~8G~uukGjazYzeX;DtweNtE&Gl-wN}^b_d4g) z5kt=aPPA|kY5sS=x39?CbEHZwtD->1nxk@Ue<}~CuRkr@S^P1a0$^nQrtkJuIk`aO zYtFBm@BQ=h%}BQ`H?GoYAZI71ME*V$!C~{P8Y2#MdaU5vrNJm1GiNbTQBt>aYwhfo z&@M-U_dCG|ZFp=HBzTgBo1xXWpslki_e<5kdR!MX)#I z9YNgyOnTVlU6Fv6B%Al`@t2XNx0f?N2WRL0v+lRpcSmQt^Wo^{nbLRu3Zw*9kU{-i z_^n9}pr^UM{GtUm{uV;CWh+)z*7NIYn+&AA&zwBqI3IO|fkTX0o3WmSt?lZDt3$v% zMURf`{C2-PUcl!VfHKSIypYpcI{r4*#?Sp9AI|v(LdOpDRm*hKOzDywc`dG zT;Gof=x6Ro+Z^F?*-=qUGs0>v-Uf=NaAx(lq1UN3+*<$H?zH4AM^{zA50f~et0Mtk|bqdkbNXat6cQHI~5le7T(qv<>bHz@2ew1 z_*bz^g!qR)+nAYARjsS51V{6mQfSAhiN6QZlN)aribqw_Mc6lghU|Je<>y5~Th9XG zl?{PWN=-AvN#QEFQ>F1F-HK#z2$0_KF}4}z&)|VGjSl%c$A>N%@@D{cg#z4Inb{5J z?8lj))hL}DpO!W@cprC6{7jLgwGUxaHMWzrHWavL8EP{oET>c~5 zEqi?R%`C=Mta%UUB?ke`x8T9RPcfXbV-h{Ry6%d+;EXkpVCv9)FMXT1=Yj%6V?MjT5`n`4BHb`Z z_gcc|X$&e|>r=ENvqC@1Pp8C^l>EMlYWRQ-Ou0tFK+SfcF+c1?ir?fplSOmRtiJwG zY;Mr`aT=5xoCvRbBjM5Vl1f{=j6~`RrMo%H5&MtcU<|QjR$b0o@=%&WUF<#>v{l_g zsCW^O9-G2$grRI1pXV(d_$a2yQSTr2^T>vqsz9YnfB*iLDvgkU`2t0%#5a@hEJXa* zGDs4cMx-ted!$xiT0i#2gE?=xlUOjCz#hP%CKbc{pt`R*4Drjit$! 
z{dGowdLZa(J>dDC7Bo<72;*pMM4T{;YC!VtL1-0Dm04^e68N%tX<6X-jc$SSg?jd=lLtA&Oz>>T1>c?4lkVT1Co%jT9n| zhlif%4jUx!OmC|sQn0wk1r`aftoausdr=#$o`{b?S*+rQmrk?cT9r^%Th_}&cne{Y z_I11bIF%QQK)JZ}cxROJQ2|%m&_4=F$rzN3=b-Ev9xE~Q;GYC@+++@`XFD8C`GP+H zIHXG}SE!+a$ps5Z{8Mjf0U>qTwmzjVUS+s*>R&Q_JYZBASKHfGlPt@pydPr9NL{+h zqW#cEr!4BezU<&^_hCKWxUkU4;<#U(N7TX0b{0BXKKiHL>{?OO(2YnB-s+Vp^`^EY zQBZsEMQNdRzExg9Y5NBqX;9osLAMPZ1oZa*H0i4cORoDs$KMSax{(o{{rDsCPxb(C z7uh0!?Yei_-b8u>q%jVp%&Ra@1}yAcwbO;@CC_EEQ7sq3uX|o}^8lj=d?CrcXG7GJ z>_iO8`>Yb_tDv43hIn8k$b*Z_|65pF?y1Lb*sL8#`Yp(z)`f1glOg5zu*F@VFmhkm z4~kV9u%AAUMuqmCTAb+k^YLVR5Y&cvdxn#wvOYU|68u|FC2!RbbC^P!hhDGu+bCAx zj`p?|mCs`!hm2lbqIwjAW0}FEAP{p0OOO6XoL_n>z=WArMK*G`Pw_=qJ`X2e zanzooFw)mEtcNJJ~n z3@=yTHsoND-9N1bTQcF2onMHv^1^}pXLtYHnQcKDP_sP~qLIHU(b8&fR(UBxPCKA&&06bCmU19~a==$nTXxl)amVe=l0 zM!gwvf~DZhuzVERT(F1f83L7De|^@AHRruM!~fN5B966iMaRflmrKf9S`t?VBPyE_ z_8UF;<8=It`-w|pBn&=4T|8QJ4**j=fHOQ9d3fX*itLf7a7tQ8wE&hJp#Y$_b~pL| zR(2u!6bX&{;Scl}%}$vZk2lqEUn<@;k2%;8E$(OffXl*GNt%A&o5h*VMrGyfcM?oY zB`KwPu$LgCP%0%B=%fg6GnFLG!c;>FUa$weG!8Vj%zV;%$Z^&J1WGA-Q&E#y@$_PJ zr+hH#BJ>bU8Q`0Fz(?FGA*y%d8t?SI_112xx0kGijyiRJxXB-elJJ5P zf^mxaCs;|w5{kGB&PCR?Y^;sV({?V%WOCu(^Kc=nqOT7%Hy_Qo9k6ra4{k&{ZBLou z!Lt_`6$ufD6jNuXe@D4W@oZX90Ws4A(%i$Qx8i zD;=4z+O^)XArYz4-oV$0m^!7!yaa(yrw8*x$kBKI}BdfkqC;V*V~2Qei6CE9VG2xnA09#L$3JZ^Sr zmG8_e>{4#X*lNp?f~AYMPOj&c62}Z}E^WC`xct|z;f9lYyOap*8%C6w7*>_v%7xB4 zu8?Ww)B-9}@9@uiMOD*3K7Tk-%OJM<5!Hl3C!Z}Sd)}#feZ?Dcog&1F?#hOEt;SGv zbhxTup3U#5d*M9mZ-19?7<^;eG|D-H8$nH_P%15TP9Z7$2`wJzK8`e{??I1SS5wPh zRZSQ5P*^d)LEH&^S77-|c(oQgBU`$9;M`UKNyf!j?~KqFy#dOuw(ZCFEU-9SPHcnqcg;Z>EZf+7Y&J#u1d23U> zR&QvN&MH%k1A7qDJ(r98oNE%*~_Cz$zu>bZcra%p7 zd}At57Vop$v*F?4o1MYs608UmbC_W~e?5I69GO7zsou$?BN|6}g|z6#?PwYRH_Q05 z+2}B_H zl%$>*l1#(a=B^?$^5ot$duEt0H$2nEUR-qp-=$}Iq($mnm5%TSn`ErnSk8oth$h+2}wCdQ16ilD&uv{3a(mPaVR507>AfId;#b z!k)MaVYmT@^ZTjEv~wocWAG+2<>_e*xNw>-PnoIQ9`LM9JGVjqS*=FXI*y^(mFL0z z292HLPDzJ;@XdENv(i^JD8aJobIrR65>z&`e#h`g6T#z9^9H)JCeGi`rN)zug9HDJ zAwJj@vO^|BSKXV5J$-`ED2Th z>NH1hW_bE7c#YzgOC+Z*cS2}7RzfN#2HX>5z%=dp-%tZpZiHZoUo%jJMY`eSEZeBzBYsA1Hm?L6xOiSh_m4=iVU&(>MnKWBH3c2XiCvVcVk+3_&0hQ{DvH8$ zxqHTZWJ*X(U`Hm2p%9ADXHi6B&+|B9*a6j?2p4)<$;W@$ASqt$8|FxT++t^9W=1*o zZn6ITF`9&VutJ^Vry>tHVE|wNb^Ukl{O;~-`tUtdshEjbY<;0rOz;Qla8r@ibozV> z>4Ay|rP)f4vsU9F?pjeNqQ{@a<-6_#tzV4fVqtg)*uA6SxkoiLj^Vc)D{iZGq-K0c zmWtIJ1Kr(Udd|9|Oy3R2V!h8g-6;!vu6~kNfo7~SBU>pvd~?{Z@kiqu9vrpLYJ;p@ zxu|bw11|CL4B5*W+=Vk6-kd=-4Z(mrpWautbds&sj`_&80qW&0G6qalcN)jaCsa0DuEi(AuxsEkzBw@o6= z#-BI7wm8cm@UaQ;oCh{J(~GGO@*c=>21UHk4@{Hj-vl;(Loi0k=|LCL>16~GL_z|! 
z>1F6|YDZ0A?mqAc#F-K3LJ)ll-GI~0xLr9XYRcYw5)zYmV)YD9@?HAm(Pi6I%gi&f zh3u!SS6V>=Zwwz|jjdTP-aNw?j(!jye-?jgxqC7`rgG$pa*8|dkIgHPXQnA!uBRp< zqnp2weW?kkGT%Wr3|mPxXMXbZ+pa}^_qSgHI(-J3O4sa_O8l(zD)Zee*rlyr{xoR6 zFmTWZ<=$BL`0dE<>x;wM2&w?b$$1CPT+tfXSAFz{z_-_oYV^%jBl6jo7#ZUlhwq$l zT_?^EE`36}3w6tlA5(;K)GZ~2*%}Lq%NbEi)+J#0QObCEeuMLDSLut0#cmj4qGwOx zf-cS==uvlblZJ(3)uUHKK3t=sp3(22;t^)oy=uSE=gRe8U7STtrqSYPvKcZWBuMww z^`=tcL6T7_1Z@7Mn=xD~`r6c{(I(k4!f? zgY92u@bndj5nK5Hafp>29T~=mCgX=jXSCoh4s-J&?rg(#2D;pIKSD^vSP*SQN1ZQ^ z;CXq#D@(|gac=r`@tA1tGk+AXhvHN&#-fEinc4N!rN&E|IgF_vWim-)d+7)7tE%K` zSYxu<*K@e6IyYDLT_jM*bD#yi^j`5=W!kJze%NN^0S{-=tOY09{G4+@lL7LF?fzhM zf0fgN&G6zMoX35Rw;teZw1`btfqAx4A0mAkj9egfXjt6`oq7L3DsVC^SiR*K{*sS- z;yW!>9`si8Qz(23M!ta|J8yx;O{c<%!N+i2Oqi}w0h6xJoeR*(q9`9v0@%1F7ii76 zxEblSP7jhx6)O|@$_IC zCHs8xo~IWT$`$0+!5-M}TKXE)(ovG{9DLT=n#MLjje(tt8)re=oFWA9LG-F{glvSx z+R)x8duA-AlrF(H|Jv^RD&M1@4Ljd$w^hvHo(*uIU~ImqKtxmoW7~;wi2qFfv}>RSOgqW7Qkr)F zT10=0f+8AuWLaIf%y}1ToebzTjz;~yt&_w>Xz(KwL#04E)qXlD@)RRT2|^$x#l`<0}wy8dtuU#JM4AB>^Xn|(J; zEkm)$igP$;v8v}*4teH&>vem(izUJrzC#0+AbM-HTgwuM!<`ai>vS%c3Kfr6xIfS%(LQ)NuzIdpx?AD44NwlQ;oH0|9SwbMsnW zxt+~1sNQts;0=7_K9hJOL$mWB)n`7Ah#{Aom+hJ*i8G3D<=CdgEG^G=?eedqzCH!7 zxKhcC=%vJ~Rv*gP`9*yMRxP>ED`ejlT&}KPlG~e*juKqid*lb1VITFpjV0K{Tf-4H z0dQdg^V+eqH{%gj*pMYyFJzIh+uSgUIwAZ#SI&RGds$-!~}+Zp5&5e z0R}(EL)=c5@H^!oKf?f#66bOu0~s8GJ3w7OuhIu3Wm#>nmT7)Uf=R)laqXk)PShD_i)*cWY+HN2sHon zd^#j8G|p0)Ilr20lDrxMAB|&gZ>|38&7(AT1)#5|Xt}5?^yze0|4O?lvE3Yt!LwVu zz~fb-gF~CYr{Sj*`~@s-9TdrG^kX+aB?)KoujlFZ7O58Wy``;H0JWCkL5(t(gxdr> zTd?++b+l86&|Jt)iB4XqJz1DFi47gAg3>a_v=hip@iAt5t(Ej|Jg>9enUDuc+u=!l z!;zn`V36i{x63bar`B95K~ttTgdVnC{ncRfEf7-j;{9bl9C~3~=aeJSE&r218vhu8 zoKCoX@%mNR{`Q@q6Tt@|JbbNMw}c11!+$YCbv5ct74`g>P_ zamjLoS4Eskd;i)=Y;`oOlk9TznY+HLT*v9#PXIQLR^8p-8T>#(G8lNK|LKpenH)9> z+g-e-2L(_q<|C><{}<9CkS{H~N9)ecjQzq|x1rcURv<-g1kJv?;FsEAU^)aJT=<=X z1Y~Mf0-qEslF#mjTh9Zz`N(}Dzn3on>;y+37Xt~#Sd9G?(~c?eL%j+DoXD8RnLF0R z-vmD3h1xtooa-g^!~dg&&5sMb-UsL_mJcTdcNFx_!Sdsp@3z|~4Dvu;{u0-}K6|xY z_=h1jefJQG0RY*c(pubO_YAQB#?GyOg!GS)er%gt|LmmgUG~pT`f=9$A3W}Y=zQ|_ XYtp7GhlKwC{xq-aUPE1d@Z^60?sBS` diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 09b56576699e0..6e46a47338398 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -270,23 +270,92 @@ for i in range(2): ## Power iteration clustering (PIC) -Power iteration clustering (PIC) is a scalable and efficient algorithm for clustering points given pointwise mutual affinity values. Internally the algorithm: +Power iteration clustering (PIC) is a scalable and efficient algorithm for clustering vertices of a +graph given pairwise similarties as edge properties, +described in [Lin and Cohen, Power Iteration Clustering](http://www.icml2010.org/papers/387.pdf). +It computes a pseudo-eigenvector of the normalized affinity matrix of the graph via +[power iteration](http://en.wikipedia.org/wiki/Power_iteration) and uses it to cluster vertices. +MLlib includes an implementation of PIC using GraphX as its backend. +It takes an `RDD` of `(srcId, dstId, similarity)` tuples and outputs a model with the clustering assignments. +The similarities must be nonnegative. +PIC assumes that the similarity measure is symmetric. +A pair `(srcId, dstId)` regardless of the ordering should appear at most once in the input data. +If a pair is missing from input, their similarity is treated as zero. 
+MLlib's PIC implementation takes the following (hyper-)parameters:
+
+* `k`: number of clusters
+* `maxIterations`: maximum number of power iterations
+* `initializationMode`: initialization mode. This can be either "random", which is the default,
+  to use a random vector as vertex properties, or "degree" to use normalized sum similarities.
 
-* accepts a [Graph](api/graphx/index.html#org.apache.spark.graphx.Graph) that represents a normalized pairwise affinity between all input points.
-* calculates the principal eigenvalue and eigenvector
-* Clusters each of the input points according to their principal eigenvector component value
 
+**Examples**
+
+In the following, we show code snippets to demonstrate how to use PIC in MLlib.
+
+
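For the parameters above, a minimal configuration sketch is shown here before the language-specific snippets. It assumes builder-style setters that mirror the parameter names; `setK` and `setMaxIterations` appear in the snippets below, while `setInitializationMode` is an assumption to check against the API docs.

{% highlight scala %}
import org.apache.spark.mllib.clustering.PowerIterationClustering

// Illustrative settings only; setInitializationMode is assumed to exist and to
// accept the "random" (default) or "degree" modes described above.
val pic = new PowerIterationClustering()
  .setK(3)                          // number of clusters
  .setMaxIterations(20)             // maximum number of power iterations
  .setInitializationMode("degree")  // or "random", the default
{% endhighlight %}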

    +
+
+[`PowerIterationClustering`](api/scala/index.html#org.apache.spark.mllib.clustering.PowerIterationClustering)
+implements the PIC algorithm.
+It takes an `RDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the
+affinity matrix.
+Calling `PowerIterationClustering.run` returns a
+[`PowerIterationClusteringModel`](api/scala/index.html#org.apache.spark.mllib.clustering.PowerIterationClusteringModel),
+which contains the computed clustering assignments.
 
-Details of this algorithm are found within [Power Iteration Clustering, Lin and Cohen]{www.icml2010.org/papers/387.pdf}
 
+{% highlight scala %}
+import org.apache.spark.mllib.clustering.PowerIterationClustering
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.rdd.RDD
 
-Example outputs for a dataset inspired by the paper - but with five clusters instead of three- have he following output from our implementation:
 
+val similarities: RDD[(Long, Long, Double)] = ...
+
+val pic = new PowerIterationClustering()
+  .setK(3)
+  .setMaxIterations(20)
+val model = pic.run(similarities)
+
+model.assignments.foreach { case (vertexId, clusterId) =>
+  println(s"$vertexId -> $clusterId")
+}
+{% endhighlight %}
+
+A full example that produces the experiment described in the PIC paper can be found under
+[`examples/`](https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala).
+
+
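The snippet above leaves `similarities` elided. One hypothetical way to build it is sketched here, using made-up vertex ids and weights (they mirror the small Java example added later in this patch); the local-mode `SparkContext` setup is likewise for illustration only.

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

// Illustrative data only: (srcId, dstId, similarity) with each pair listed at most once.
val sc = new SparkContext(new SparkConf().setAppName("PICInputSketch").setMaster("local[2]"))
val similarities: RDD[(Long, Long, Double)] = sc.parallelize(Seq(
  (0L, 1L, 0.9), (1L, 2L, 0.9), (2L, 3L, 0.9),  // one tightly linked group
  (3L, 4L, 0.1),                                // a weak link between two groups
  (4L, 5L, 0.9)))                               // another tightly linked pair
{% endhighlight %}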
+
+[`PowerIterationClustering`](api/java/org/apache/spark/mllib/clustering/PowerIterationClustering.html)
+implements the PIC algorithm.
+It takes a `JavaRDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the
+affinity matrix.
+Calling `PowerIterationClustering.run` returns a
+[`PowerIterationClusteringModel`](api/java/org/apache/spark/mllib/clustering/PowerIterationClusteringModel.html)
+which contains the computed clustering assignments.
+
+{% highlight java %}
+import scala.Tuple2;
+import scala.Tuple3;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.clustering.PowerIterationClustering;
+import org.apache.spark.mllib.clustering.PowerIterationClusteringModel;
+
+JavaRDD<Tuple3<Long, Long, Double>> similarities = ...
+
+PowerIterationClustering pic = new PowerIterationClustering()
+  .setK(2)
+  .setMaxIterations(10);
+PowerIterationClusteringModel model = pic.run(similarities);
+
+for (Tuple2<Object, Object> assignment: model.assignments().toJavaRDD().collect()) {
+  System.out.println(assignment._1() + " -> " + assignment._2());
+}
+{% endhighlight %}
+
    + +
    ## Latent Dirichlet allocation (LDA) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java new file mode 100644 index 0000000000000..e9371de39f284 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import scala.Tuple2; +import scala.Tuple3; + +import com.google.common.collect.Lists; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.clustering.PowerIterationClustering; +import org.apache.spark.mllib.clustering.PowerIterationClusteringModel; + +/** + * Java example for graph clustering using power iteration clustering (PIC). + */ +public class JavaPowerIterationClusteringExample { + public static void main(String[] args) { + SparkConf sparkConf = new SparkConf().setAppName("JavaPowerIterationClusteringExample"); + JavaSparkContext sc = new JavaSparkContext(sparkConf); + + @SuppressWarnings("unchecked") + JavaRDD> similarities = sc.parallelize(Lists.newArrayList( + new Tuple3(0L, 1L, 0.9), + new Tuple3(1L, 2L, 0.9), + new Tuple3(2L, 3L, 0.9), + new Tuple3(3L, 4L, 0.1), + new Tuple3(4L, 5L, 0.9))); + + PowerIterationClustering pic = new PowerIterationClustering() + .setK(2) + .setMaxIterations(10); + PowerIterationClusteringModel model = pic.run(similarities); + + for (Tuple2 assignment: model.assignments().toJavaRDD().collect()) { + System.out.println(assignment._1() + " -> " + assignment._2()); + } + + sc.stop(); + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index 3b1caf0c679ef..63d03347f4572 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -17,6 +17,7 @@ package org.apache.spark.mllib.clustering +import org.apache.spark.api.java.JavaRDD import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Experimental import org.apache.spark.graphx._ @@ -115,6 +116,14 @@ class PowerIterationClustering private[clustering] ( pic(w0) } + /** + * A Java-friendly version of [[PowerIterationClustering.run]]. 
+ */ + def run(similarities: JavaRDD[(java.lang.Long, java.lang.Long, java.lang.Double)]) + : PowerIterationClusteringModel = { + run(similarities.rdd.asInstanceOf[RDD[(Long, Long, Double)]]) + } + /** * Runs the PIC algorithm. * From e945aa6139e022d13ac793f46819cfee07b782fc Mon Sep 17 00:00:00 2001 From: Kay Ousterhout Date: Thu, 19 Feb 2015 09:49:34 +0800 Subject: [PATCH 249/357] [SPARK-5846] Correctly set job description and pool for SQL jobs marmbrus am I missing something obvious here? I verified that this fixes the problem for me (on 1.2.1) on EC2, but I'm confused about how others wouldn't have noticed this? Author: Kay Ousterhout Closes #4630 from kayousterhout/SPARK-5846_1.3 and squashes the following commits: 2022ad4 [Kay Ousterhout] [SPARK-5846] Correctly set job description and pool for SQL jobs --- .../org/apache/spark/sql/hive/thriftserver/Shim12.scala | 8 ++++---- .../org/apache/spark/sql/hive/thriftserver/Shim13.scala | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala b/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala index ea9d61d8d0f5e..13116b40bb259 100644 --- a/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala +++ b/sql/hive-thriftserver/v0.12.0/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim12.scala @@ -185,6 +185,10 @@ private[hive] class SparkExecuteStatementOperation( def run(): Unit = { logInfo(s"Running query '$statement'") setState(OperationState.RUNNING) + hiveContext.sparkContext.setJobDescription(statement) + sessionToActivePool.get(parentSession.getSessionHandle).foreach { pool => + hiveContext.sparkContext.setLocalProperty("spark.scheduler.pool", pool) + } try { result = hiveContext.sql(statement) logDebug(result.queryExecution.toString()) @@ -194,10 +198,6 @@ private[hive] class SparkExecuteStatementOperation( logInfo(s"Setting spark.scheduler.pool=$value for future statements in this session.") case _ => } - hiveContext.sparkContext.setJobDescription(statement) - sessionToActivePool.get(parentSession.getSessionHandle).foreach { pool => - hiveContext.sparkContext.setLocalProperty("spark.scheduler.pool", pool) - } iter = { val useIncrementalCollect = hiveContext.getConf("spark.sql.thriftServer.incrementalCollect", "false").toBoolean diff --git a/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala b/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala index 71e3954b2c7ac..9b8faeff94eab 100644 --- a/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala +++ b/sql/hive-thriftserver/v0.13.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/Shim13.scala @@ -156,6 +156,10 @@ private[hive] class SparkExecuteStatementOperation( def run(): Unit = { logInfo(s"Running query '$statement'") setState(OperationState.RUNNING) + hiveContext.sparkContext.setJobDescription(statement) + sessionToActivePool.get(parentSession.getSessionHandle).foreach { pool => + hiveContext.sparkContext.setLocalProperty("spark.scheduler.pool", pool) + } try { result = hiveContext.sql(statement) logDebug(result.queryExecution.toString()) @@ -165,10 +169,6 @@ private[hive] class SparkExecuteStatementOperation( logInfo(s"Setting spark.scheduler.pool=$value for future statements in this session.") case _ => } - 
hiveContext.sparkContext.setJobDescription(statement) - sessionToActivePool.get(parentSession.getSessionHandle).foreach { pool => - hiveContext.sparkContext.setLocalProperty("spark.scheduler.pool", pool) - } iter = { val useIncrementalCollect = hiveContext.getConf("spark.sql.thriftServer.incrementalCollect", "false").toBoolean From fb87f449219c673a16bc46f85c1ef7a6e3f22736 Mon Sep 17 00:00:00 2001 From: Jacek Lewandowski Date: Thu, 19 Feb 2015 09:53:36 -0800 Subject: [PATCH 250/357] SPARK-5548: Fix for AkkaUtilsSuite failure - attempt 2 Author: Jacek Lewandowski Closes #4653 from jacek-lewandowski/SPARK-5548-2-master and squashes the following commits: 0e199b6 [Jacek Lewandowski] SPARK-5548: applied reviewer's comments 843eafb [Jacek Lewandowski] SPARK-5548: Fix for AkkaUtilsSuite failure - attempt 2 --- .../scala/org/apache/spark/util/AkkaUtilsSuite.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala index 2cc5817758cf7..6250d50fb7036 100644 --- a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.util import java.util.concurrent.TimeoutException import scala.concurrent.Await +import scala.util.{Failure, Try} import akka.actor._ @@ -370,8 +371,12 @@ class AkkaUtilsSuite extends FunSuite with LocalSparkContext with ResetSystemPro val selection = slaveSystem.actorSelection( AkkaUtils.address(AkkaUtils.protocol(slaveSystem), "spark", "localhost", boundPort, "MapOutputTracker")) val timeout = AkkaUtils.lookupTimeout(conf) - intercept[TimeoutException] { - slaveTracker.trackerActor = Await.result(selection.resolveOne(timeout * 2), timeout) + val result = Try(Await.result(selection.resolveOne(timeout * 2), timeout)) + + result match { + case Failure(ex: ActorNotFound) => + case Failure(ex: TimeoutException) => + case r => fail(s"$r is neither Failure(ActorNotFound) nor Failure(TimeoutException)") } actorSystem.shutdown() From 38e624a732b18e01ad2e7a499ce0bb0d7acdcdf6 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Thu, 19 Feb 2015 09:56:25 -0800 Subject: [PATCH 251/357] [SPARK-5816] Add huge compatibility warning in DriverWrapper The stability of the new submission gateway assumes that the arguments in `DriverWrapper` are consistent across multiple Spark versions. However, this is not at all clear from the code itself. In fact, this was broken in 20a6013106b56a1a1cc3e8cda092330ffbe77cc3, which is fortunately OK because both that commit and the original commit that added this gateway are part of the same release. To prevent this from happening again we should at the very least add a huge warning where appropriate. 
Author: Andrew Or Closes #4687 from andrewor14/driver-wrapper-warning and squashes the following commits: 7989b56 [Andrew Or] Add huge compatibility warning --- .../org/apache/spark/deploy/worker/DriverWrapper.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala index ab467a5ee8c6c..deef6ef9043c6 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala @@ -26,10 +26,17 @@ import org.apache.spark.util.{AkkaUtils, ChildFirstURLClassLoader, MutableURLCla /** * Utility object for launching driver programs such that they share fate with the Worker process. + * This is used in standalone cluster mode only. */ object DriverWrapper { def main(args: Array[String]) { args.toList match { + /* + * IMPORTANT: Spark 1.3 provides a stable application submission gateway that is both + * backward and forward compatible across future Spark versions. Because this gateway + * uses this class to launch the driver, the ordering and semantics of the arguments + * here must also remain consistent across versions. + */ case workerUrl :: userJar :: mainClass :: extraArgs => val conf = new SparkConf() val (actorSystem, _) = AkkaUtils.createActorSystem("Driver", From 90095bf3ce9304d09a32ceffaa99069079071b59 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Thu, 19 Feb 2015 18:37:31 +0000 Subject: [PATCH 252/357] [SPARK-5423][Core] Cleanup resources in DiskMapIterator.finalize to ensure deleting the temp file This PR adds a `finalize` method in DiskMapIterator to clean up the resources even if some exception happens during processing data. Author: zsxwing Closes #4219 from zsxwing/SPARK-5423 and squashes the following commits: d4b2ca6 [zsxwing] Cleanup resources in DiskMapIterator.finalize to ensure deleting the temp file --- .../collection/ExternalAppendOnlyMap.scala | 52 +++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala index 8a0f5a602de12..fc7e86e297540 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala @@ -387,6 +387,15 @@ class ExternalAppendOnlyMap[K, V, C]( private var batchIndex = 0 // Which batch we're in private var fileStream: FileInputStream = null + @volatile private var closed = false + + // A volatile variable to remember which DeserializationStream is using. Need to set it when we + // open a DeserializationStream. But we should use `deserializeStream` rather than + // `deserializeStreamToBeClosed` to read the content because touching a volatile variable will + // reduce the performance. It must be volatile so that we can see its correct value in the + // `finalize` method, which could run in any thread. + @volatile private var deserializeStreamToBeClosed: DeserializationStream = null + // An intermediate stream that reads from exactly one batch // This guards against pre-fetching and other arbitrary behavior of higher level streams private var deserializeStream = nextBatchStream() @@ -401,6 +410,7 @@ class ExternalAppendOnlyMap[K, V, C]( // we're still in a valid batch. 
if (batchIndex < batchOffsets.length - 1) { if (deserializeStream != null) { + deserializeStreamToBeClosed = null deserializeStream.close() fileStream.close() deserializeStream = null @@ -419,7 +429,11 @@ class ExternalAppendOnlyMap[K, V, C]( val bufferedStream = new BufferedInputStream(ByteStreams.limit(fileStream, end - start)) val compressedStream = blockManager.wrapForCompression(blockId, bufferedStream) - ser.deserializeStream(compressedStream) + // Before returning the stream, assign it to `deserializeStreamToBeClosed` so that we can + // close it in `finalize` and also avoid to touch the volatile `deserializeStreamToBeClosed` + // during reading the (K, C) pairs. + deserializeStreamToBeClosed = ser.deserializeStream(compressedStream) + deserializeStreamToBeClosed } else { // No more batches left cleanup() @@ -468,14 +482,34 @@ class ExternalAppendOnlyMap[K, V, C]( item } - // TODO: Ensure this gets called even if the iterator isn't drained. - private def cleanup() { - batchIndex = batchOffsets.length // Prevent reading any other batch - val ds = deserializeStream - deserializeStream = null - fileStream = null - ds.close() - file.delete() + // TODO: Now only use `finalize` to ensure `close` gets called to clean up the resources. In the + // future, we need some mechanism to ensure this gets called once the resources are not used. + private def cleanup(): Unit = { + if (!closed) { + closed = true + batchIndex = batchOffsets.length // Prevent reading any other batch + fileStream = null + try { + val ds = deserializeStreamToBeClosed + deserializeStreamToBeClosed = null + deserializeStream = null + if (ds != null) { + ds.close() + } + } finally { + if (file.exists()) { + file.delete() + } + } + } + } + + override def finalize(): Unit = { + try { + cleanup() + } finally { + super.finalize() + } } } From 94cdb05ff7e6b8fc5b3a574202ba8bc8e5bbe689 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Thu, 19 Feb 2015 12:07:51 -0800 Subject: [PATCH 253/357] [SPARK-5825] [Spark Submit] Remove the double checking instance name when stopping the service `spark-daemon.sh` will confirm the process id by fuzzy matching the class name while stopping the service, however, it will fail if the java process arguments is very long (greater than 4096 characters). This PR looses the check for the service process. Author: Cheng Hao Closes #4611 from chenghao-intel/stopping_service and squashes the following commits: a0051f6 [Cheng Hao] loosen the process checking while stopping a service --- sbin/spark-daemon.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbin/spark-daemon.sh b/sbin/spark-daemon.sh index ec6d0b5a40ef2..e1bcc7d64254a 100755 --- a/sbin/spark-daemon.sh +++ b/sbin/spark-daemon.sh @@ -166,7 +166,7 @@ case $option in if [ -f $pid ]; then TARGET_ID="$(cat "$pid")" - if [[ $(ps -p "$TARGET_ID" -o args=) =~ $command ]]; then + if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then echo "stopping $command" kill "$TARGET_ID" else From 8ca3418e1b3e2687e75a08c185d17045a97279fb Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 19 Feb 2015 12:09:44 -0800 Subject: [PATCH 254/357] [SPARK-5904][SQL] DataFrame API fixes. 1. Column is no longer a DataFrame to simplify class hierarchy. 2. Don't use varargs on abstract methods (see Scala compiler bug SI-9013). Author: Reynold Xin Closes #4686 from rxin/SPARK-5904 and squashes the following commits: fd9b199 [Reynold Xin] Fixed Python tests. df25cef [Reynold Xin] Non final. 5221530 [Reynold Xin] [SPARK-5904][SQL] DataFrame API fixes. 
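Concretely, point 1 means a `Column` is now just an expression that is used inside DataFrame operators rather than something that can itself be collected; the Python doctest updates below (for example, `df.age.collect()` becoming `df.select(df.age).collect()`) reflect the same shift. The following is a rough Scala sketch of the resulting usage; the data and the local-mode setup are invented for illustration and are not part of this patch.

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Illustrative only: a tiny local DataFrame to show Column-as-expression usage.
val sc = new SparkContext(new SparkConf().setAppName("ColumnSketch").setMaster("local[2]"))
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._

val df = sc.parallelize(Seq(("Alice", 2), ("Bob", 5))).toDF("name", "age")

// Columns are expressions handed to DataFrame operators...
df.select(df("age") + 1).show()
df.filter(df("age") > 3).show()

// ...while actions such as show()/collect() stay on the DataFrame itself; after
// this change an expression like df("age") + 1 no longer carries those methods.
{% endhighlight %}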
--- python/pyspark/sql/dataframe.py | 56 +- .../scala/org/apache/spark/sql/Column.scala | 223 +++----- .../apache/spark/sql/ComputableColumn.scala | 33 -- .../org/apache/spark/sql/DataFrame.scala | 420 ++++++++++++--- .../org/apache/spark/sql/DataFrameImpl.scala | 483 ------------------ .../org/apache/spark/sql/GroupedData.scala | 2 +- .../apache/spark/sql/IncomputableColumn.scala | 183 ------- .../spark/sql/ColumnExpressionSuite.scala | 44 +- .../org/apache/spark/sql/DataFrameSuite.scala | 7 +- 9 files changed, 427 insertions(+), 1024 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/ComputableColumn.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index c68c97e9260e2..010c38f93b9cf 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -546,7 +546,7 @@ def first(self): def __getitem__(self, item): """ Return the column by given name - >>> df['age'].collect() + >>> df.select(df['age']).collect() [Row(age=2), Row(age=5)] >>> df[ ["name", "age"]].collect() [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)] @@ -555,7 +555,7 @@ def __getitem__(self, item): """ if isinstance(item, basestring): jc = self._jdf.apply(item) - return Column(jc, self.sql_ctx) + return Column(jc) elif isinstance(item, Column): return self.filter(item) elif isinstance(item, list): @@ -566,13 +566,13 @@ def __getitem__(self, item): def __getattr__(self, name): """ Return the column by given name - >>> df.age.collect() + >>> df.select(df.age).collect() [Row(age=2), Row(age=5)] """ if name.startswith("__"): raise AttributeError(name) jc = self._jdf.apply(name) - return Column(jc, self.sql_ctx) + return Column(jc) def select(self, *cols): """ Selecting a set of expressions. @@ -698,7 +698,7 @@ def withColumnRenamed(self, existing, new): >>> df.withColumnRenamed('age', 'age2').collect() [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')] """ - cols = [Column(_to_java_column(c), self.sql_ctx).alias(new) + cols = [Column(_to_java_column(c)).alias(new) if c == existing else c for c in self.columns] return self.select(*cols) @@ -873,15 +873,16 @@ def _unary_op(name, doc="unary operator"): """ Create a method for given unary operator """ def _(self): jc = getattr(self._jc, name)() - return Column(jc, self.sql_ctx) + return Column(jc) _.__doc__ = doc return _ def _func_op(name, doc=''): def _(self): - jc = getattr(self._sc._jvm.functions, name)(self._jc) - return Column(jc, self.sql_ctx) + sc = SparkContext._active_spark_context + jc = getattr(sc._jvm.functions, name)(self._jc) + return Column(jc) _.__doc__ = doc return _ @@ -892,7 +893,7 @@ def _bin_op(name, doc="binary operator"): def _(self, other): jc = other._jc if isinstance(other, Column) else other njc = getattr(self._jc, name)(jc) - return Column(njc, self.sql_ctx) + return Column(njc) _.__doc__ = doc return _ @@ -903,12 +904,12 @@ def _reverse_op(name, doc="binary operator"): def _(self, other): jother = _create_column_from_literal(other) jc = getattr(jother, name)(self._jc) - return Column(jc, self.sql_ctx) + return Column(jc) _.__doc__ = doc return _ -class Column(DataFrame): +class Column(object): """ A column in a DataFrame. 
@@ -924,9 +925,8 @@ class Column(DataFrame): 1 / df.colName """ - def __init__(self, jc, sql_ctx=None): + def __init__(self, jc): self._jc = jc - super(Column, self).__init__(jc, sql_ctx) # arithmetic operators __neg__ = _func_op("negate") @@ -975,7 +975,7 @@ def substr(self, startPos, length): :param startPos: start position (int or Column) :param length: length of the substring (int or Column) - >>> df.name.substr(1, 3).collect() + >>> df.select(df.name.substr(1, 3).alias("col")).collect() [Row(col=u'Ali'), Row(col=u'Bob')] """ if type(startPos) != type(length): @@ -986,7 +986,7 @@ def substr(self, startPos, length): jc = self._jc.substr(startPos._jc, length._jc) else: raise TypeError("Unexpected type: %s" % type(startPos)) - return Column(jc, self.sql_ctx) + return Column(jc) __getslice__ = substr @@ -1000,10 +1000,10 @@ def substr(self, startPos, length): def alias(self, alias): """Return a alias for this column - >>> df.age.alias("age2").collect() + >>> df.select(df.age.alias("age2")).collect() [Row(age2=2), Row(age2=5)] """ - return Column(getattr(self._jc, "as")(alias), self.sql_ctx) + return Column(getattr(self._jc, "as")(alias)) def cast(self, dataType): """ Convert the column into type `dataType` @@ -1013,34 +1013,18 @@ def cast(self, dataType): >>> df.select(df.age.cast(StringType()).alias('ages')).collect() [Row(ages=u'2'), Row(ages=u'5')] """ - if self.sql_ctx is None: - sc = SparkContext._active_spark_context - ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc()) - else: - ssql_ctx = self.sql_ctx._ssql_ctx if isinstance(dataType, basestring): jc = self._jc.cast(dataType) elif isinstance(dataType, DataType): + sc = SparkContext._active_spark_context + ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc()) jdt = ssql_ctx.parseDataType(dataType.json()) jc = self._jc.cast(jdt) - return Column(jc, self.sql_ctx) + return Column(jc) def __repr__(self): return 'Column<%s>' % self._jdf.toString().encode('utf8') - def toPandas(self): - """ - Return a pandas.Series from the column - - >>> df.age.toPandas() # doctest: +SKIP - 0 2 - 1 5 - dtype: int64 - """ - import pandas as pd - data = [c for c, in self.collect()] - return pd.Series(data) - def _test(): import doctest diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 8b6241c213c87..980754322e6c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -22,20 +22,15 @@ import scala.language.implicitConversions import org.apache.spark.annotation.Experimental import org.apache.spark.sql.functions.lit import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} -import org.apache.spark.sql.catalyst.analysis.UnresolvedGetField +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedStar, UnresolvedGetField} import org.apache.spark.sql.types._ private[sql] object Column { - def apply(colName: String): Column = new IncomputableColumn(colName) + def apply(colName: String): Column = new Column(colName) - def apply(expr: Expression): Column = new IncomputableColumn(expr) - - def apply(sqlContext: SQLContext, plan: LogicalPlan, expr: Expression): Column = { - new ComputableColumn(sqlContext, plan, expr) - } + def apply(expr: Expression): Column = new Column(expr) def unapply(col: Column): Option[Expression] = Some(col.expr) } @@ -51,68 +46,18 @@ private[sql] object Column { * @groupname Ungrouped Support functions 
for DataFrames. */ @Experimental -trait Column extends DataFrame { - - protected[sql] def expr: Expression - - /** - * Returns true iff the [[Column]] is computable. - */ - def isComputable: Boolean - - /** Removes the top project so we can get to the underlying plan. */ - private def stripProject(p: LogicalPlan): LogicalPlan = p match { - case Project(_, child) => child - case p => sys.error("Unexpected logical plan (expected Project): " + p) - } - - private def computableCol(baseCol: ComputableColumn, expr: Expression) = { - val namedExpr = expr match { - case named: NamedExpression => named - case unnamed: Expression => Alias(unnamed, "col")() - } - val plan = Project(Seq(namedExpr), stripProject(baseCol.plan)) - Column(baseCol.sqlContext, plan, expr) - } +class Column(protected[sql] val expr: Expression) { - /** - * Construct a new column based on the expression and the other column value. - * - * There are two cases that can happen here: - * If otherValue is a constant, it is first turned into a Column. - * If otherValue is a Column, then: - * - If this column and otherValue are both computable and come from the same logical plan, - * then we can construct a ComputableColumn by applying a Project on top of the base plan. - * - If this column is not computable, but otherValue is computable, then we can construct - * a ComputableColumn based on otherValue's base plan. - * - If this column is computable, but otherValue is not, then we can construct a - * ComputableColumn based on this column's base plan. - * - If neither columns are computable, then we create an IncomputableColumn. - */ - private def constructColumn(otherValue: Any)(newExpr: Column => Expression): Column = { - // lit(otherValue) returns a Column always. - (this, lit(otherValue)) match { - case (left: ComputableColumn, right: ComputableColumn) => - if (stripProject(left.plan).sameResult(stripProject(right.plan))) { - computableCol(right, newExpr(right)) - } else { - // We don't want to throw an exception here because "df1("a") === df2("b")" can be - // a valid expression for join conditions, even though standalone they are not valid. - Column(newExpr(right)) - } - case (left: ComputableColumn, right) => computableCol(left, newExpr(right)) - case (_, right: ComputableColumn) => computableCol(right, newExpr(right)) - case (_, right) => Column(newExpr(right)) - } - } + def this(name: String) = this(name match { + case "*" => UnresolvedStar(None) + case _ if name.endsWith(".*") => UnresolvedStar(Some(name.substring(0, name.length - 2))) + case _ => UnresolvedAttribute(name) + }) /** Creates a column based on the given expression. */ - private def exprToColumn(newExpr: Expression, computable: Boolean = true): Column = { - this match { - case c: ComputableColumn if computable => computableCol(c, newExpr) - case _ => Column(newExpr) - } - } + implicit private def exprToColumn(newExpr: Expression): Column = new Column(newExpr) + + override def toString: String = expr.prettyString /** * Unary minus, i.e. negate the expression. @@ -127,7 +72,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def unary_- : Column = exprToColumn(UnaryMinus(expr)) + def unary_- : Column = UnaryMinus(expr) /** * Inversion of boolean expression, i.e. NOT. @@ -142,7 +87,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def unary_! : Column = exprToColumn(Not(expr)) + def unary_! : Column = Not(expr) /** * Equality test. 
@@ -157,9 +102,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def === (other: Any): Column = constructColumn(other) { o => - EqualTo(expr, o.expr) - } + def === (other: Any): Column = EqualTo(expr, lit(other).expr) /** * Equality test. @@ -190,9 +133,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def !== (other: Any): Column = constructColumn(other) { o => - Not(EqualTo(expr, o.expr)) - } + def !== (other: Any): Column = Not(EqualTo(expr, lit(other).expr)) /** * Inequality test. @@ -208,9 +149,7 @@ trait Column extends DataFrame { * * @group java_expr_ops */ - def notEqual(other: Any): Column = constructColumn(other) { o => - Not(EqualTo(expr, o.expr)) - } + def notEqual(other: Any): Column = Not(EqualTo(expr, lit(other).expr)) /** * Greater than. @@ -225,9 +164,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def > (other: Any): Column = constructColumn(other) { o => - GreaterThan(expr, o.expr) - } + def > (other: Any): Column = GreaterThan(expr, lit(other).expr) /** * Greater than. @@ -256,9 +193,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def < (other: Any): Column = constructColumn(other) { o => - LessThan(expr, o.expr) - } + def < (other: Any): Column = LessThan(expr, lit(other).expr) /** * Less than. @@ -286,9 +221,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def <= (other: Any): Column = constructColumn(other) { o => - LessThanOrEqual(expr, o.expr) - } + def <= (other: Any): Column = LessThanOrEqual(expr, lit(other).expr) /** * Less than or equal to. @@ -316,9 +249,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def >= (other: Any): Column = constructColumn(other) { o => - GreaterThanOrEqual(expr, o.expr) - } + def >= (other: Any): Column = GreaterThanOrEqual(expr, lit(other).expr) /** * Greater than or equal to an expression. @@ -339,9 +270,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def <=> (other: Any): Column = constructColumn(other) { o => - EqualNullSafe(expr, o.expr) - } + def <=> (other: Any): Column = EqualNullSafe(expr, lit(other).expr) /** * Equality test that is safe for null values. @@ -355,14 +284,14 @@ trait Column extends DataFrame { * * @group expr_ops */ - def isNull: Column = exprToColumn(IsNull(expr)) + def isNull: Column = IsNull(expr) /** * True if the current expression is NOT null. * * @group expr_ops */ - def isNotNull: Column = exprToColumn(IsNotNull(expr)) + def isNotNull: Column = IsNotNull(expr) /** * Boolean OR. @@ -376,9 +305,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def || (other: Any): Column = constructColumn(other) { o => - Or(expr, o.expr) - } + def || (other: Any): Column = Or(expr, lit(other).expr) /** * Boolean OR. @@ -406,9 +333,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def && (other: Any): Column = constructColumn(other) { o => - And(expr, o.expr) - } + def && (other: Any): Column = And(expr, lit(other).expr) /** * Boolean AND. @@ -436,9 +361,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def + (other: Any): Column = constructColumn(other) { o => - Add(expr, o.expr) - } + def + (other: Any): Column = Add(expr, lit(other).expr) /** * Sum of this expression and another expression. @@ -466,9 +389,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def - (other: Any): Column = constructColumn(other) { o => - Subtract(expr, o.expr) - } + def - (other: Any): Column = Subtract(expr, lit(other).expr) /** * Subtraction. 
Subtract the other expression from this expression. @@ -496,9 +417,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def * (other: Any): Column = constructColumn(other) { o => - Multiply(expr, o.expr) - } + def * (other: Any): Column = Multiply(expr, lit(other).expr) /** * Multiplication of this expression and another expression. @@ -526,9 +445,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def / (other: Any): Column = constructColumn(other) { o => - Divide(expr, o.expr) - } + def / (other: Any): Column = Divide(expr, lit(other).expr) /** * Division this expression by another expression. @@ -549,9 +466,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def % (other: Any): Column = constructColumn(other) { o => - Remainder(expr, o.expr) - } + def % (other: Any): Column = Remainder(expr, lit(other).expr) /** * Modulo (a.k.a. remainder) expression. @@ -567,37 +482,35 @@ trait Column extends DataFrame { * @group expr_ops */ @scala.annotation.varargs - def in(list: Column*): Column = { - new IncomputableColumn(In(expr, list.map(_.expr))) - } + def in(list: Column*): Column = In(expr, list.map(_.expr)) /** * SQL like expression. * * @group expr_ops */ - def like(literal: String): Column = exprToColumn(Like(expr, lit(literal).expr)) + def like(literal: String): Column = Like(expr, lit(literal).expr) /** * SQL RLIKE expression (LIKE with Regex). * * @group expr_ops */ - def rlike(literal: String): Column = exprToColumn(RLike(expr, lit(literal).expr)) + def rlike(literal: String): Column = RLike(expr, lit(literal).expr) /** * An expression that gets an item at position `ordinal` out of an array. * * @group expr_ops */ - def getItem(ordinal: Int): Column = exprToColumn(GetItem(expr, Literal(ordinal))) + def getItem(ordinal: Int): Column = GetItem(expr, Literal(ordinal)) /** * An expression that gets a field by name in a [[StructField]]. * * @group expr_ops */ - def getField(fieldName: String): Column = exprToColumn(UnresolvedGetField(expr, fieldName)) + def getField(fieldName: String): Column = UnresolvedGetField(expr, fieldName) /** * An expression that returns a substring. @@ -606,8 +519,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def substr(startPos: Column, len: Column): Column = - exprToColumn(Substring(expr, startPos.expr, len.expr), computable = false) + def substr(startPos: Column, len: Column): Column = Substring(expr, startPos.expr, len.expr) /** * An expression that returns a substring. @@ -616,26 +528,21 @@ trait Column extends DataFrame { * * @group expr_ops */ - def substr(startPos: Int, len: Int): Column = - exprToColumn(Substring(expr, lit(startPos).expr, lit(len).expr)) + def substr(startPos: Int, len: Int): Column = Substring(expr, lit(startPos).expr, lit(len).expr) /** * Contains the other element. * * @group expr_ops */ - def contains(other: Any): Column = constructColumn(other) { o => - Contains(expr, o.expr) - } + def contains(other: Any): Column = Contains(expr, lit(other).expr) /** * String starts with. * * @group expr_ops */ - def startsWith(other: Column): Column = constructColumn(other) { o => - StartsWith(expr, o.expr) - } + def startsWith(other: Column): Column = StartsWith(expr, lit(other).expr) /** * String starts with another string literal. 
@@ -649,9 +556,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def endsWith(other: Column): Column = constructColumn(other) { o => - EndsWith(expr, o.expr) - } + def endsWith(other: Column): Column = EndsWith(expr, lit(other).expr) /** * String ends with another string literal. @@ -669,7 +574,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - override def as(alias: String): Column = exprToColumn(Alias(expr, alias)()) + def as(alias: String): Column = Alias(expr, alias)() /** * Gives the column an alias. @@ -680,7 +585,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - override def as(alias: Symbol): Column = exprToColumn(Alias(expr, alias.name)()) + def as(alias: Symbol): Column = Alias(expr, alias.name)() /** * Casts the column to a different data type. @@ -695,7 +600,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def cast(to: DataType): Column = exprToColumn(Cast(expr, to)) + def cast(to: DataType): Column = Cast(expr, to) /** * Casts the column to a different data type, using the canonical string representation @@ -708,22 +613,20 @@ trait Column extends DataFrame { * * @group expr_ops */ - def cast(to: String): Column = exprToColumn( - Cast(expr, to.toLowerCase match { - case "string" => StringType - case "boolean" => BooleanType - case "byte" => ByteType - case "short" => ShortType - case "int" => IntegerType - case "long" => LongType - case "float" => FloatType - case "double" => DoubleType - case "decimal" => DecimalType.Unlimited - case "date" => DateType - case "timestamp" => TimestampType - case _ => throw new RuntimeException(s"""Unsupported cast type: "$to"""") - }) - ) + def cast(to: String): Column = Cast(expr, to.toLowerCase match { + case "string" | "str" => StringType + case "boolean" => BooleanType + case "byte" => ByteType + case "short" => ShortType + case "int" => IntegerType + case "long" => LongType + case "float" => FloatType + case "double" => DoubleType + case "decimal" => DecimalType.Unlimited + case "date" => DateType + case "timestamp" => TimestampType + case _ => throw new RuntimeException(s"""Unsupported cast type: "$to"""") + }) /** * Returns an ordering used in sorting. @@ -737,7 +640,7 @@ trait Column extends DataFrame { * * @group expr_ops */ - def desc: Column = exprToColumn(SortOrder(expr, Descending), computable = false) + def desc: Column = SortOrder(expr, Descending) /** * Returns an ordering used in sorting. @@ -751,14 +654,14 @@ trait Column extends DataFrame { * * @group expr_ops */ - def asc: Column = exprToColumn(SortOrder(expr, Ascending), computable = false) + def asc: Column = SortOrder(expr, Ascending) /** - * Prints the plans (logical and physical) to the console for debugging purpose. + * Prints the expression to the console for debugging purpose. 
* * @group df_ops */ - override def explain(extended: Boolean): Unit = { + def explain(extended: Boolean): Unit = { if (extended) { println(expr) } else { @@ -768,7 +671,7 @@ trait Column extends DataFrame { } -class ColumnName(name: String) extends IncomputableColumn(name) { +class ColumnName(name: String) extends Column(name) { /** Creates a new AttributeReference of type boolean */ def boolean: StructField = StructField(name, BooleanType) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/ComputableColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/ComputableColumn.scala deleted file mode 100644 index ac479b26a7c6a..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/ComputableColumn.scala +++ /dev/null @@ -1,33 +0,0 @@ -/* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -package org.apache.spark.sql - -import scala.language.implicitConversions - -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan - - -private[sql] class ComputableColumn protected[sql]( - sqlContext: SQLContext, - protected[sql] val plan: LogicalPlan, - protected[sql] val expr: Expression) - extends DataFrameImpl(sqlContext, plan) with Column { - - override def isComputable: Boolean = true -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 5007a5a34de1a..810f7c77477bb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -17,26 +17,38 @@ package org.apache.spark.sql +import java.io.CharArrayWriter import java.sql.DriverManager - import scala.collection.JavaConversions._ +import scala.language.implicitConversions import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag import scala.util.control.NonFatal +import com.fasterxml.jackson.core.JsonFactory + import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.api.java.JavaRDD +import org.apache.spark.api.python.SerDeUtil import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel +import org.apache.spark.sql.catalyst.{ScalaReflection, SqlParser} +import org.apache.spark.sql.catalyst.analysis.{UnresolvedRelation, ResolvedStar} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.{JoinType, Inner} import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, LogicalRDD} import org.apache.spark.sql.jdbc.JDBCWriteDetails -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.json.JsonRDD +import org.apache.spark.sql.types.{NumericType, StructType} +import 
org.apache.spark.sql.sources.{ResolvedDataSource, CreateTableUsingAsSelect} import org.apache.spark.util.Utils + private[sql] object DataFrame { def apply(sqlContext: SQLContext, logicalPlan: LogicalPlan): DataFrame = { - new DataFrameImpl(sqlContext, logicalPlan) + new DataFrame(sqlContext, logicalPlan) } } @@ -90,22 +102,100 @@ private[sql] object DataFrame { */ // TODO: Improve documentation. @Experimental -trait DataFrame extends RDDApi[Row] with Serializable { +class DataFrame protected[sql]( + @transient val sqlContext: SQLContext, + @DeveloperApi @transient val queryExecution: SQLContext#QueryExecution) + extends RDDApi[Row] with Serializable { + + /** + * A constructor that automatically analyzes the logical plan. + * + * This reports error eagerly as the [[DataFrame]] is constructed, unless + * [[SQLConf.dataFrameEagerAnalysis]] is turned off. + */ + def this(sqlContext: SQLContext, logicalPlan: LogicalPlan) = { + this(sqlContext, { + val qe = sqlContext.executePlan(logicalPlan) + if (sqlContext.conf.dataFrameEagerAnalysis) { + qe.analyzed // This should force analysis and throw errors if there are any + } + qe + }) + } + + @transient protected[sql] val logicalPlan: LogicalPlan = queryExecution.logical match { + // For various commands (like DDL) and queries with side effects, we force query optimization to + // happen right away to let these side effects take place eagerly. + case _: Command | + _: InsertIntoTable | + _: CreateTableAsSelect[_] | + _: CreateTableUsingAsSelect | + _: WriteToFile => + LogicalRDD(queryExecution.analyzed.output, queryExecution.toRdd)(sqlContext) + case _ => + queryExecution.logical + } + + /** + * An implicit conversion function internal to this class for us to avoid doing + * "new DataFrameImpl(...)" everywhere. + */ + @inline private implicit def logicalPlanToDataFrame(logicalPlan: LogicalPlan): DataFrame = { + new DataFrame(sqlContext, logicalPlan) + } - val sqlContext: SQLContext + protected[sql] def resolve(colName: String): NamedExpression = { + queryExecution.analyzed.resolve(colName, sqlContext.analyzer.resolver).getOrElse { + throw new AnalysisException( + s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""") + } + } - @DeveloperApi - def queryExecution: SQLContext#QueryExecution + protected[sql] def numericColumns: Seq[Expression] = { + schema.fields.filter(_.dataType.isInstanceOf[NumericType]).map { n => + queryExecution.analyzed.resolve(n.name, sqlContext.analyzer.resolver).get + } + } - protected[sql] def logicalPlan: LogicalPlan + /** + * Internal API for Python + */ + private[sql] def showString(): String = { + val data = take(20) + val numCols = schema.fieldNames.length - override def toString = + // For cells that are beyond 20 characters, replace it with the first 17 and "..." + val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row => + row.toSeq.map { cell => + val str = if (cell == null) "null" else cell.toString + if (str.length > 20) str.substring(0, 17) + "..." 
else str + }: Seq[String] + } + + // Compute the width of each column + val colWidths = Array.fill(numCols)(0) + for (row <- rows) { + for ((cell, i) <- row.zipWithIndex) { + colWidths(i) = math.max(colWidths(i), cell.length) + } + } + + // Pad the cells + rows.map { row => + row.zipWithIndex.map { case (cell, i) => + String.format(s"%-${colWidths(i)}s", cell) + }.mkString(" ") + }.mkString("\n") + } + + override def toString: String = { try { schema.map(f => s"${f.name}: ${f.dataType.simpleString}").mkString("[", ", ", "]") } catch { case NonFatal(e) => s"Invalid tree; ${e.getMessage}:\n$queryExecution" } + } /** Left here for backward compatibility. */ @deprecated("1.3.0", "use toDF") @@ -130,19 +220,31 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group basic */ @scala.annotation.varargs - def toDF(colNames: String*): DataFrame + def toDF(colNames: String*): DataFrame = { + require(schema.size == colNames.size, + "The number of columns doesn't match.\n" + + "Old column names: " + schema.fields.map(_.name).mkString(", ") + "\n" + + "New column names: " + colNames.mkString(", ")) + + val newCols = schema.fieldNames.zip(colNames).map { case (oldName, newName) => + apply(oldName).as(newName) + } + select(newCols :_*) + } /** * Returns the schema of this [[DataFrame]]. * @group basic */ - def schema: StructType + def schema: StructType = queryExecution.analyzed.schema /** * Returns all column names and their data types as an array. * @group basic */ - def dtypes: Array[(String, String)] + def dtypes: Array[(String, String)] = schema.fields.map { field => + (field.name, field.dataType.toString) + } /** * Returns all column names as an array. @@ -154,13 +256,19 @@ trait DataFrame extends RDDApi[Row] with Serializable { * Prints the schema to the console in a nice tree format. * @group basic */ - def printSchema(): Unit + def printSchema(): Unit = println(schema.treeString) /** * Prints the plans (logical and physical) to the console for debugging purpose. * @group basic */ - def explain(extended: Boolean): Unit + def explain(extended: Boolean): Unit = { + ExplainCommand( + logicalPlan, + extended = extended).queryExecution.executedPlan.executeCollect().map { + r => println(r.getString(0)) + } + } /** * Only prints the physical plan to the console for debugging purpose. @@ -173,7 +281,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * (without any Spark executors). * @group basic */ - def isLocal: Boolean + def isLocal: Boolean = logicalPlan.isInstanceOf[LocalRelation] /** * Displays the [[DataFrame]] in a tabular form. For example: @@ -187,7 +295,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * }}} * @group basic */ - def show(): Unit + def show(): Unit = println(showString()) /** * Cartesian join with another [[DataFrame]]. @@ -197,7 +305,9 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @param right Right side of the join operation. * @group dfops */ - def join(right: DataFrame): DataFrame + def join(right: DataFrame): DataFrame = { + Join(logicalPlan, right.logicalPlan, joinType = Inner, None) + } /** * Inner join with another [[DataFrame]], using the given join expression. 
@@ -209,7 +319,9 @@ trait DataFrame extends RDDApi[Row] with Serializable { * }}} * @group dfops */ - def join(right: DataFrame, joinExprs: Column): DataFrame + def join(right: DataFrame, joinExprs: Column): DataFrame = { + Join(logicalPlan, right.logicalPlan, joinType = Inner, Some(joinExprs.expr)) + } /** * Join with another [[DataFrame]], using the given join expression. The following performs @@ -230,7 +342,9 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @param joinType One of: `inner`, `outer`, `left_outer`, `right_outer`, `semijoin`. * @group dfops */ - def join(right: DataFrame, joinExprs: Column, joinType: String): DataFrame + def join(right: DataFrame, joinExprs: Column, joinType: String): DataFrame = { + Join(logicalPlan, right.logicalPlan, JoinType(joinType), Some(joinExprs.expr)) + } /** * Returns a new [[DataFrame]] sorted by the specified column, all in ascending order. @@ -243,7 +357,9 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group dfops */ @scala.annotation.varargs - def sort(sortCol: String, sortCols: String*): DataFrame + def sort(sortCol: String, sortCols: String*): DataFrame = { + sort((sortCol +: sortCols).map(apply) :_*) + } /** * Returns a new [[DataFrame]] sorted by the given expressions. For example: @@ -253,7 +369,17 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group dfops */ @scala.annotation.varargs - def sort(sortExprs: Column*): DataFrame + def sort(sortExprs: Column*): DataFrame = { + val sortOrder: Seq[SortOrder] = sortExprs.map { col => + col.expr match { + case expr: SortOrder => + expr + case expr: Expression => + SortOrder(expr, Ascending) + } + } + Sort(sortOrder, global = true, logicalPlan) + } /** * Returns a new [[DataFrame]] sorted by the given expressions. @@ -261,7 +387,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group dfops */ @scala.annotation.varargs - def orderBy(sortCol: String, sortCols: String*): DataFrame + def orderBy(sortCol: String, sortCols: String*): DataFrame = sort(sortCol, sortCols :_*) /** * Returns a new [[DataFrame]] sorted by the given expressions. @@ -269,7 +395,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group dfops */ @scala.annotation.varargs - def orderBy(sortExprs: Column*): DataFrame + def orderBy(sortExprs: Column*): DataFrame = sort(sortExprs :_*) /** * Selects column based on the column name and return it as a [[Column]]. @@ -281,19 +407,25 @@ trait DataFrame extends RDDApi[Row] with Serializable { * Selects column based on the column name and return it as a [[Column]]. * @group dfops */ - def col(colName: String): Column + def col(colName: String): Column = colName match { + case "*" => + Column(ResolvedStar(schema.fieldNames.map(resolve))) + case _ => + val expr = resolve(colName) + Column(expr) + } /** * Returns a new [[DataFrame]] with an alias set. * @group dfops */ - def as(alias: String): DataFrame + def as(alias: String): DataFrame = Subquery(alias, logicalPlan) /** * (Scala-specific) Returns a new [[DataFrame]] with an alias set. * @group dfops */ - def as(alias: Symbol): DataFrame + def as(alias: Symbol): DataFrame = as(alias.name) /** * Selects a set of expressions. 
@@ -303,7 +435,13 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group dfops */ @scala.annotation.varargs - def select(cols: Column*): DataFrame + def select(cols: Column*): DataFrame = { + val namedExpressions = cols.map { + case Column(expr: NamedExpression) => expr + case Column(expr: Expression) => Alias(expr, expr.prettyString)() + } + Project(namedExpressions.toSeq, logicalPlan) + } /** * Selects a set of columns. This is a variant of `select` that can only select @@ -317,7 +455,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group dfops */ @scala.annotation.varargs - def select(col: String, cols: String*): DataFrame + def select(col: String, cols: String*): DataFrame = select((col +: cols).map(Column(_)) :_*) /** * Selects a set of SQL expressions. This is a variant of `select` that accepts @@ -329,7 +467,11 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group dfops */ @scala.annotation.varargs - def selectExpr(exprs: String*): DataFrame + def selectExpr(exprs: String*): DataFrame = { + select(exprs.map { expr => + Column(new SqlParser().parseExpression(expr)) + }: _*) + } /** * Filters rows using the given condition. @@ -341,7 +483,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * }}} * @group dfops */ - def filter(condition: Column): DataFrame + def filter(condition: Column): DataFrame = Filter(condition.expr, logicalPlan) /** * Filters rows using the given SQL expression. @@ -350,7 +492,9 @@ trait DataFrame extends RDDApi[Row] with Serializable { * }}} * @group dfops */ - def filter(conditionExpr: String): DataFrame + def filter(conditionExpr: String): DataFrame = { + filter(Column(new SqlParser().parseExpression(conditionExpr))) + } /** * Filters rows using the given condition. This is an alias for `filter`. @@ -362,7 +506,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * }}} * @group dfops */ - def where(condition: Column): DataFrame + def where(condition: Column): DataFrame = filter(condition) /** * Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them. @@ -381,7 +525,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group dfops */ @scala.annotation.varargs - def groupBy(cols: Column*): GroupedData + def groupBy(cols: Column*): GroupedData = new GroupedData(this, cols.map(_.expr)) /** * Groups the [[DataFrame]] using the specified columns, so we can run aggregation on them. @@ -403,7 +547,10 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group dfops */ @scala.annotation.varargs - def groupBy(col1: String, cols: String*): GroupedData + def groupBy(col1: String, cols: String*): GroupedData = { + val colNames: Seq[String] = col1 +: cols + new GroupedData(this, colNames.map(colName => resolve(colName))) + } /** * (Scala-specific) Compute aggregates by specifying a map from column name to @@ -462,28 +609,28 @@ trait DataFrame extends RDDApi[Row] with Serializable { * and `head` is that `head` returns an array while `limit` returns a new [[DataFrame]]. * @group dfops */ - def limit(n: Int): DataFrame + def limit(n: Int): DataFrame = Limit(Literal(n), logicalPlan) /** * Returns a new [[DataFrame]] containing union of rows in this frame and another frame. * This is equivalent to `UNION ALL` in SQL. * @group dfops */ - def unionAll(other: DataFrame): DataFrame + def unionAll(other: DataFrame): DataFrame = Union(logicalPlan, other.logicalPlan) /** * Returns a new [[DataFrame]] containing rows only in both this frame and another frame. 
* This is equivalent to `INTERSECT` in SQL. * @group dfops */ - def intersect(other: DataFrame): DataFrame + def intersect(other: DataFrame): DataFrame = Intersect(logicalPlan, other.logicalPlan) /** * Returns a new [[DataFrame]] containing rows in this frame but not in another frame. * This is equivalent to `EXCEPT` in SQL. * @group dfops */ - def except(other: DataFrame): DataFrame + def except(other: DataFrame): DataFrame = Except(logicalPlan, other.logicalPlan) /** * Returns a new [[DataFrame]] by sampling a fraction of rows. @@ -493,7 +640,9 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @param seed Seed for sampling. * @group dfops */ - def sample(withReplacement: Boolean, fraction: Double, seed: Long): DataFrame + def sample(withReplacement: Boolean, fraction: Double, seed: Long): DataFrame = { + Sample(fraction, withReplacement, seed, logicalPlan) + } /** * Returns a new [[DataFrame]] by sampling a fraction of rows, using a random seed. @@ -527,8 +676,15 @@ trait DataFrame extends RDDApi[Row] with Serializable { * }}} * @group dfops */ - def explode[A <: Product : TypeTag](input: Column*)(f: Row => TraversableOnce[A]): DataFrame + def explode[A <: Product : TypeTag](input: Column*)(f: Row => TraversableOnce[A]): DataFrame = { + val schema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType] + val attributes = schema.toAttributes + val rowFunction = + f.andThen(_.map(ScalaReflection.convertToCatalyst(_, schema).asInstanceOf[Row])) + val generator = UserDefinedGenerator(attributes, rowFunction, input.map(_.expr)) + Generate(generator, join = true, outer = false, None, logicalPlan) + } /** * (Scala-specific) Returns a new [[DataFrame]] where a single column has been expanded to zero @@ -540,10 +696,17 @@ trait DataFrame extends RDDApi[Row] with Serializable { * }}} * @group dfops */ - def explode[A, B : TypeTag]( - inputColumn: String, - outputColumn: String)( - f: A => TraversableOnce[B]): DataFrame + def explode[A, B : TypeTag](inputColumn: String, outputColumn: String)(f: A => TraversableOnce[B]) + : DataFrame = { + val dataType = ScalaReflection.schemaFor[B].dataType + val attributes = AttributeReference(outputColumn, dataType)() :: Nil + def rowFunction(row: Row) = { + f(row(0).asInstanceOf[A]).map(o => Row(ScalaReflection.convertToCatalyst(o, dataType))) + } + val generator = UserDefinedGenerator(attributes, rowFunction, apply(inputColumn).expr :: Nil) + + Generate(generator, join = true, outer = false, None, logicalPlan) + } ///////////////////////////////////////////////////////////////////////////// @@ -551,110 +714,130 @@ trait DataFrame extends RDDApi[Row] with Serializable { * Returns a new [[DataFrame]] by adding a column. * @group dfops */ - def withColumn(colName: String, col: Column): DataFrame + def withColumn(colName: String, col: Column): DataFrame = select(Column("*"), col.as(colName)) /** * Returns a new [[DataFrame]] with a column renamed. * @group dfops */ - def withColumnRenamed(existingName: String, newName: String): DataFrame + def withColumnRenamed(existingName: String, newName: String): DataFrame = { + val resolver = sqlContext.analyzer.resolver + val colNames = schema.map { field => + val name = field.name + if (resolver(name, existingName)) Column(name).as(newName) else Column(name) + } + select(colNames :_*) + } /** * Returns the first `n` rows. */ - def head(n: Int): Array[Row] + def head(n: Int): Array[Row] = limit(n).collect() /** * Returns the first row. 
*/ - def head(): Row + def head(): Row = head(1).head /** * Returns the first row. Alias for head(). */ - override def first(): Row + override def first(): Row = head() /** * Returns a new RDD by applying a function to all rows of this DataFrame. * @group rdd */ - override def map[R: ClassTag](f: Row => R): RDD[R] + override def map[R: ClassTag](f: Row => R): RDD[R] = rdd.map(f) /** * Returns a new RDD by first applying a function to all rows of this [[DataFrame]], * and then flattening the results. * @group rdd */ - override def flatMap[R: ClassTag](f: Row => TraversableOnce[R]): RDD[R] + override def flatMap[R: ClassTag](f: Row => TraversableOnce[R]): RDD[R] = rdd.flatMap(f) /** * Returns a new RDD by applying a function to each partition of this DataFrame. * @group rdd */ - override def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] + override def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] = { + rdd.mapPartitions(f) + } /** * Applies a function `f` to all rows. * @group rdd */ - override def foreach(f: Row => Unit): Unit + override def foreach(f: Row => Unit): Unit = rdd.foreach(f) /** * Applies a function f to each partition of this [[DataFrame]]. * @group rdd */ - override def foreachPartition(f: Iterator[Row] => Unit): Unit + override def foreachPartition(f: Iterator[Row] => Unit): Unit = rdd.foreachPartition(f) /** * Returns the first `n` rows in the [[DataFrame]]. * @group action */ - override def take(n: Int): Array[Row] + override def take(n: Int): Array[Row] = head(n) /** * Returns an array that contains all of [[Row]]s in this [[DataFrame]]. * @group action */ - override def collect(): Array[Row] + override def collect(): Array[Row] = queryExecution.executedPlan.executeCollect() /** * Returns a Java list that contains all of [[Row]]s in this [[DataFrame]]. * @group action */ - override def collectAsList(): java.util.List[Row] + override def collectAsList(): java.util.List[Row] = java.util.Arrays.asList(rdd.collect() :_*) /** * Returns the number of rows in the [[DataFrame]]. * @group action */ - override def count(): Long + override def count(): Long = groupBy().count().rdd.collect().head.getLong(0) /** * Returns a new [[DataFrame]] that has exactly `numPartitions` partitions. * @group rdd */ - override def repartition(numPartitions: Int): DataFrame + override def repartition(numPartitions: Int): DataFrame = { + sqlContext.createDataFrame(rdd.repartition(numPartitions), schema) + } /** * Returns a new [[DataFrame]] that contains only the unique rows from this [[DataFrame]]. * @group dfops */ - override def distinct: DataFrame + override def distinct: DataFrame = Distinct(logicalPlan) /** * @group basic */ - override def persist(): this.type + override def persist(): this.type = { + sqlContext.cacheManager.cacheQuery(this) + this + } /** * @group basic */ - override def persist(newLevel: StorageLevel): this.type + override def persist(newLevel: StorageLevel): this.type = { + sqlContext.cacheManager.cacheQuery(this, None, newLevel) + this + } /** * @group basic */ - override def unpersist(blocking: Boolean): this.type + override def unpersist(blocking: Boolean): this.type = { + sqlContext.cacheManager.tryUncacheQuery(this, blocking) + this + } ///////////////////////////////////////////////////////////////////////////// // I/O @@ -664,7 +847,11 @@ trait DataFrame extends RDDApi[Row] with Serializable { * Returns the content of the [[DataFrame]] as an [[RDD]] of [[Row]]s. 
* @group rdd */ - def rdd: RDD[Row] + def rdd: RDD[Row] = { + // use a local variable to make sure the map closure doesn't capture the whole DataFrame + val schema = this.schema + queryExecution.executedPlan.execute().map(ScalaReflection.convertRowToScala(_, schema)) + } /** * Returns the content of the [[DataFrame]] as a [[JavaRDD]] of [[Row]]s. @@ -684,7 +871,9 @@ trait DataFrame extends RDDApi[Row] with Serializable { * * @group basic */ - def registerTempTable(tableName: String): Unit + def registerTempTable(tableName: String): Unit = { + sqlContext.registerDataFrameAsTable(this, tableName) + } /** * Saves the contents of this [[DataFrame]] as a parquet file, preserving the schema. @@ -692,7 +881,13 @@ trait DataFrame extends RDDApi[Row] with Serializable { * using the `parquetFile` function in [[SQLContext]]. * @group output */ - def saveAsParquetFile(path: String): Unit + def saveAsParquetFile(path: String): Unit = { + if (sqlContext.conf.parquetUseDataSourceApi) { + save("org.apache.spark.sql.parquet", SaveMode.ErrorIfExists, Map("path" -> path)) + } else { + sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd + } + } /** * :: Experimental :: @@ -747,9 +942,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group output */ @Experimental - def saveAsTable( - tableName: String, - source: String): Unit = { + def saveAsTable(tableName: String, source: String): Unit = { saveAsTable(tableName, source, SaveMode.ErrorIfExists) } @@ -765,10 +958,7 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group output */ @Experimental - def saveAsTable( - tableName: String, - source: String, - mode: SaveMode): Unit = { + def saveAsTable(tableName: String, source: String, mode: SaveMode): Unit = { saveAsTable(tableName, source, mode, Map.empty[String, String]) } @@ -809,7 +999,18 @@ trait DataFrame extends RDDApi[Row] with Serializable { tableName: String, source: String, mode: SaveMode, - options: Map[String, String]): Unit + options: Map[String, String]): Unit = { + val cmd = + CreateTableUsingAsSelect( + tableName, + source, + temporary = false, + mode, + options, + logicalPlan) + + sqlContext.executePlan(cmd).toRdd + } /** * :: Experimental :: @@ -882,7 +1083,9 @@ trait DataFrame extends RDDApi[Row] with Serializable { def save( source: String, mode: SaveMode, - options: Map[String, String]): Unit + options: Map[String, String]): Unit = { + ResolvedDataSource(sqlContext, source, mode, options, this) + } /** * :: Experimental :: @@ -890,7 +1093,10 @@ trait DataFrame extends RDDApi[Row] with Serializable { * @group output */ @Experimental - def insertInto(tableName: String, overwrite: Boolean): Unit + def insertInto(tableName: String, overwrite: Boolean): Unit = { + sqlContext.executePlan(InsertIntoTable(UnresolvedRelation(Seq(tableName)), + Map.empty, logicalPlan, overwrite)).toRdd + } /** * :: Experimental :: @@ -905,7 +1111,31 @@ trait DataFrame extends RDDApi[Row] with Serializable { * Returns the content of the [[DataFrame]] as a RDD of JSON strings. 
* @group rdd */ - def toJSON: RDD[String] + def toJSON: RDD[String] = { + val rowSchema = this.schema + this.mapPartitions { iter => + val writer = new CharArrayWriter() + // create the Generator without separator inserted between 2 records + val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) + + new Iterator[String] { + override def hasNext = iter.hasNext + override def next(): String = { + JsonRDD.rowToJSON(rowSchema, gen)(iter.next()) + gen.flush() + + val json = writer.toString + if (hasNext) { + writer.reset() + } else { + gen.close() + } + + json + } + } + } + } //////////////////////////////////////////////////////////////////////////// // JDBC Write Support @@ -919,7 +1149,21 @@ trait DataFrame extends RDDApi[Row] with Serializable { * exists. * @group output */ - def createJDBCTable(url: String, table: String, allowExisting: Boolean): Unit + def createJDBCTable(url: String, table: String, allowExisting: Boolean): Unit = { + val conn = DriverManager.getConnection(url) + try { + if (allowExisting) { + val sql = s"DROP TABLE IF EXISTS $table" + conn.prepareStatement(sql).executeUpdate() + } + val schema = JDBCWriteDetails.schemaString(this, url) + val sql = s"CREATE TABLE $table ($schema)" + conn.prepareStatement(sql).executeUpdate() + } finally { + conn.close() + } + JDBCWriteDetails.saveTable(this, url, table) + } /** * Save this RDD to a JDBC database at `url` under the table name `table`. @@ -933,8 +1177,18 @@ trait DataFrame extends RDDApi[Row] with Serializable { * `INSERT INTO table VALUES (?, ?, ..., ?)` should not fail. * @group output */ - def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit - + def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit = { + if (overwrite) { + val conn = DriverManager.getConnection(url) + try { + val sql = s"TRUNCATE TABLE $table" + conn.prepareStatement(sql).executeUpdate() + } finally { + conn.close() + } + } + JDBCWriteDetails.saveTable(this, url, table) + } //////////////////////////////////////////////////////////////////////////// // for Python API @@ -943,5 +1197,9 @@ trait DataFrame extends RDDApi[Row] with Serializable { /** * Converts a JavaRDD to a PythonRDD. */ - protected[sql] def javaToPython: JavaRDD[Array[Byte]] + protected[sql] def javaToPython: JavaRDD[Array[Byte]] = { + val fieldTypes = schema.fields.map(_.dataType) + val jrdd = rdd.map(EvaluatePython.rowToArray(_, fieldTypes)).toJavaRDD() + SerDeUtil.javaToPython(jrdd) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala deleted file mode 100644 index 25bc9d929237d..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ /dev/null @@ -1,483 +0,0 @@ -/* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -package org.apache.spark.sql - -import java.io.CharArrayWriter -import java.sql.DriverManager - -import scala.language.implicitConversions -import scala.reflect.ClassTag -import scala.reflect.runtime.universe.TypeTag -import scala.collection.JavaConversions._ - -import com.fasterxml.jackson.core.JsonFactory - -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.api.python.SerDeUtil -import org.apache.spark.rdd.RDD -import org.apache.spark.storage.StorageLevel -import org.apache.spark.sql.catalyst.{expressions, SqlParser, ScalaReflection} -import org.apache.spark.sql.catalyst.analysis.{ResolvedStar, UnresolvedRelation} -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.{JoinType, Inner} -import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.execution.{ExplainCommand, LogicalRDD, EvaluatePython} -import org.apache.spark.sql.jdbc.JDBCWriteDetails -import org.apache.spark.sql.json.JsonRDD -import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.{NumericType, StructType} - -/** - * Internal implementation of [[DataFrame]]. Users of the API should use [[DataFrame]] directly. - */ -private[sql] class DataFrameImpl protected[sql]( - @transient override val sqlContext: SQLContext, - @transient val queryExecution: SQLContext#QueryExecution) - extends DataFrame { - - /** - * A constructor that automatically analyzes the logical plan. - * - * This reports error eagerly as the [[DataFrame]] is constructed, unless - * [[SQLConf.dataFrameEagerAnalysis]] is turned off. - */ - def this(sqlContext: SQLContext, logicalPlan: LogicalPlan) = { - this(sqlContext, { - val qe = sqlContext.executePlan(logicalPlan) - if (sqlContext.conf.dataFrameEagerAnalysis) { - qe.analyzed // This should force analysis and throw errors if there are any - } - qe - }) - } - - @transient protected[sql] override val logicalPlan: LogicalPlan = queryExecution.logical match { - // For various commands (like DDL) and queries with side effects, we force query optimization to - // happen right away to let these side effects take place eagerly. - case _: Command | - _: InsertIntoTable | - _: CreateTableAsSelect[_] | - _: CreateTableUsingAsSelect | - _: WriteToFile => - LogicalRDD(queryExecution.analyzed.output, queryExecution.toRdd)(sqlContext) - case _ => - queryExecution.logical - } - - /** - * An implicit conversion function internal to this class for us to avoid doing - * "new DataFrameImpl(...)" everywhere. 
- */ - @inline private implicit def logicalPlanToDataFrame(logicalPlan: LogicalPlan): DataFrame = { - new DataFrameImpl(sqlContext, logicalPlan) - } - - protected[sql] def resolve(colName: String): NamedExpression = { - queryExecution.analyzed.resolve(colName, sqlContext.analyzer.resolver).getOrElse { - throw new AnalysisException( - s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""") - } - } - - protected[sql] def numericColumns: Seq[Expression] = { - schema.fields.filter(_.dataType.isInstanceOf[NumericType]).map { n => - queryExecution.analyzed.resolve(n.name, sqlContext.analyzer.resolver).get - } - } - - override def toDF(colNames: String*): DataFrame = { - require(schema.size == colNames.size, - "The number of columns doesn't match.\n" + - "Old column names: " + schema.fields.map(_.name).mkString(", ") + "\n" + - "New column names: " + colNames.mkString(", ")) - - val newCols = schema.fieldNames.zip(colNames).map { case (oldName, newName) => - apply(oldName).as(newName) - } - select(newCols :_*) - } - - override def schema: StructType = queryExecution.analyzed.schema - - override def dtypes: Array[(String, String)] = schema.fields.map { field => - (field.name, field.dataType.toString) - } - - override def columns: Array[String] = schema.fields.map(_.name) - - override def printSchema(): Unit = println(schema.treeString) - - override def explain(extended: Boolean): Unit = { - ExplainCommand( - logicalPlan, - extended = extended).queryExecution.executedPlan.executeCollect().map { - r => println(r.getString(0)) - } - } - - override def isLocal: Boolean = { - logicalPlan.isInstanceOf[LocalRelation] - } - - /** - * Internal API for Python - */ - private[sql] def showString(): String = { - val data = take(20) - val numCols = schema.fieldNames.length - - // For cells that are beyond 20 characters, replace it with the first 17 and "..." - val rows: Seq[Seq[String]] = schema.fieldNames.toSeq +: data.map { row => - row.toSeq.map { cell => - val str = if (cell == null) "null" else cell.toString - if (str.length > 20) str.substring(0, 17) + "..." 
else str - } : Seq[String] - } - - // Compute the width of each column - val colWidths = Array.fill(numCols)(0) - for (row <- rows) { - for ((cell, i) <- row.zipWithIndex) { - colWidths(i) = math.max(colWidths(i), cell.length) - } - } - - // Pad the cells - rows.map { row => - row.zipWithIndex.map { case (cell, i) => - String.format(s"%-${colWidths(i)}s", cell) - }.mkString(" ") - }.mkString("\n") - } - - override def show(): Unit = { - println(showString()) - } - - override def join(right: DataFrame): DataFrame = { - Join(logicalPlan, right.logicalPlan, joinType = Inner, None) - } - - override def join(right: DataFrame, joinExprs: Column): DataFrame = { - Join(logicalPlan, right.logicalPlan, Inner, Some(joinExprs.expr)) - } - - override def join(right: DataFrame, joinExprs: Column, joinType: String): DataFrame = { - Join(logicalPlan, right.logicalPlan, JoinType(joinType), Some(joinExprs.expr)) - } - - override def sort(sortCol: String, sortCols: String*): DataFrame = { - sort((sortCol +: sortCols).map(apply) :_*) - } - - override def sort(sortExprs: Column*): DataFrame = { - val sortOrder: Seq[SortOrder] = sortExprs.map { col => - col.expr match { - case expr: SortOrder => - expr - case expr: Expression => - SortOrder(expr, Ascending) - } - } - Sort(sortOrder, global = true, logicalPlan) - } - - override def orderBy(sortCol: String, sortCols: String*): DataFrame = { - sort(sortCol, sortCols :_*) - } - - override def orderBy(sortExprs: Column*): DataFrame = { - sort(sortExprs :_*) - } - - override def col(colName: String): Column = colName match { - case "*" => - Column(ResolvedStar(schema.fieldNames.map(resolve))) - case _ => - val expr = resolve(colName) - Column(sqlContext, Project(Seq(expr), logicalPlan), expr) - } - - override def as(alias: String): DataFrame = Subquery(alias, logicalPlan) - - override def as(alias: Symbol): DataFrame = Subquery(alias.name, logicalPlan) - - override def select(cols: Column*): DataFrame = { - val namedExpressions = cols.map { - case Column(expr: NamedExpression) => expr - case Column(expr: Expression) => Alias(expr, expr.prettyString)() - } - Project(namedExpressions.toSeq, logicalPlan) - } - - override def select(col: String, cols: String*): DataFrame = { - select((col +: cols).map(Column(_)) :_*) - } - - override def selectExpr(exprs: String*): DataFrame = { - select(exprs.map { expr => - Column(new SqlParser().parseExpression(expr)) - }: _*) - } - - override def withColumn(colName: String, col: Column): DataFrame = { - select(Column("*"), col.as(colName)) - } - - override def withColumnRenamed(existingName: String, newName: String): DataFrame = { - val resolver = sqlContext.analyzer.resolver - val colNames = schema.map { field => - val name = field.name - if (resolver(name, existingName)) Column(name).as(newName) else Column(name) - } - select(colNames :_*) - } - - override def filter(condition: Column): DataFrame = { - Filter(condition.expr, logicalPlan) - } - - override def filter(conditionExpr: String): DataFrame = { - filter(Column(new SqlParser().parseExpression(conditionExpr))) - } - - override def where(condition: Column): DataFrame = { - filter(condition) - } - - override def groupBy(cols: Column*): GroupedData = { - new GroupedData(this, cols.map(_.expr)) - } - - override def groupBy(col1: String, cols: String*): GroupedData = { - val colNames: Seq[String] = col1 +: cols - new GroupedData(this, colNames.map(colName => resolve(colName))) - } - - override def limit(n: Int): DataFrame = { - Limit(Literal(n), logicalPlan) - } - - override def 
unionAll(other: DataFrame): DataFrame = { - Union(logicalPlan, other.logicalPlan) - } - - override def intersect(other: DataFrame): DataFrame = { - Intersect(logicalPlan, other.logicalPlan) - } - - override def except(other: DataFrame): DataFrame = { - Except(logicalPlan, other.logicalPlan) - } - - override def sample(withReplacement: Boolean, fraction: Double, seed: Long): DataFrame = { - Sample(fraction, withReplacement, seed, logicalPlan) - } - - override def explode[A <: Product : TypeTag] - (input: Column*)(f: Row => TraversableOnce[A]): DataFrame = { - val schema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType] - val attributes = schema.toAttributes - val rowFunction = - f.andThen(_.map(ScalaReflection.convertToCatalyst(_, schema).asInstanceOf[Row])) - val generator = UserDefinedGenerator(attributes, rowFunction, input.map(_.expr)) - - Generate(generator, join = true, outer = false, None, logicalPlan) - } - - override def explode[A, B : TypeTag]( - inputColumn: String, - outputColumn: String)( - f: A => TraversableOnce[B]): DataFrame = { - val dataType = ScalaReflection.schemaFor[B].dataType - val attributes = AttributeReference(outputColumn, dataType)() :: Nil - def rowFunction(row: Row) = { - f(row(0).asInstanceOf[A]).map(o => Row(ScalaReflection.convertToCatalyst(o, dataType))) - } - val generator = UserDefinedGenerator(attributes, rowFunction, apply(inputColumn).expr :: Nil) - - Generate(generator, join = true, outer = false, None, logicalPlan) - - } - - ///////////////////////////////////////////////////////////////////////////// - // RDD API - ///////////////////////////////////////////////////////////////////////////// - - override def head(n: Int): Array[Row] = limit(n).collect() - - override def head(): Row = head(1).head - - override def first(): Row = head() - - override def map[R: ClassTag](f: Row => R): RDD[R] = rdd.map(f) - - override def flatMap[R: ClassTag](f: Row => TraversableOnce[R]): RDD[R] = rdd.flatMap(f) - - override def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] = { - rdd.mapPartitions(f) - } - - override def foreach(f: Row => Unit): Unit = rdd.foreach(f) - - override def foreachPartition(f: Iterator[Row] => Unit): Unit = rdd.foreachPartition(f) - - override def take(n: Int): Array[Row] = head(n) - - override def collect(): Array[Row] = queryExecution.executedPlan.executeCollect() - - override def collectAsList(): java.util.List[Row] = java.util.Arrays.asList(rdd.collect() :_*) - - override def count(): Long = groupBy().count().rdd.collect().head.getLong(0) - - override def repartition(numPartitions: Int): DataFrame = { - sqlContext.createDataFrame(rdd.repartition(numPartitions), schema) - } - - override def distinct: DataFrame = Distinct(logicalPlan) - - override def persist(): this.type = { - sqlContext.cacheManager.cacheQuery(this) - this - } - - override def persist(newLevel: StorageLevel): this.type = { - sqlContext.cacheManager.cacheQuery(this, None, newLevel) - this - } - - override def unpersist(blocking: Boolean): this.type = { - sqlContext.cacheManager.tryUncacheQuery(this, blocking) - this - } - - ///////////////////////////////////////////////////////////////////////////// - // I/O - ///////////////////////////////////////////////////////////////////////////// - - override def rdd: RDD[Row] = { - // use a local variable to make sure the map closure doesn't capture the whole DataFrame - val schema = this.schema - queryExecution.executedPlan.execute().map(ScalaReflection.convertRowToScala(_, schema)) - } - - 
override def registerTempTable(tableName: String): Unit = { - sqlContext.registerDataFrameAsTable(this, tableName) - } - - override def saveAsParquetFile(path: String): Unit = { - if (sqlContext.conf.parquetUseDataSourceApi) { - save("org.apache.spark.sql.parquet", SaveMode.ErrorIfExists, Map("path" -> path)) - } else { - sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd - } - } - - override def saveAsTable( - tableName: String, - source: String, - mode: SaveMode, - options: Map[String, String]): Unit = { - val cmd = - CreateTableUsingAsSelect( - tableName, - source, - temporary = false, - mode, - options, - logicalPlan) - - sqlContext.executePlan(cmd).toRdd - } - - override def save( - source: String, - mode: SaveMode, - options: Map[String, String]): Unit = { - ResolvedDataSource(sqlContext, source, mode, options, this) - } - - override def insertInto(tableName: String, overwrite: Boolean): Unit = { - sqlContext.executePlan(InsertIntoTable(UnresolvedRelation(Seq(tableName)), - Map.empty, logicalPlan, overwrite)).toRdd - } - - override def toJSON: RDD[String] = { - val rowSchema = this.schema - this.mapPartitions { iter => - val writer = new CharArrayWriter() - // create the Generator without separator inserted between 2 records - val gen = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) - - new Iterator[String] { - override def hasNext = iter.hasNext - override def next(): String = { - JsonRDD.rowToJSON(rowSchema, gen)(iter.next()) - gen.flush() - - val json = writer.toString - if (hasNext) { - writer.reset() - } else { - gen.close() - } - - json - } - } - } - } - - def createJDBCTable(url: String, table: String, allowExisting: Boolean): Unit = { - val conn = DriverManager.getConnection(url) - try { - if (allowExisting) { - val sql = s"DROP TABLE IF EXISTS $table" - conn.prepareStatement(sql).executeUpdate() - } - val schema = JDBCWriteDetails.schemaString(this, url) - val sql = s"CREATE TABLE $table ($schema)" - conn.prepareStatement(sql).executeUpdate() - } finally { - conn.close() - } - JDBCWriteDetails.saveTable(this, url, table) - } - - def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit = { - if (overwrite) { - val conn = DriverManager.getConnection(url) - try { - val sql = s"TRUNCATE TABLE $table" - conn.prepareStatement(sql).executeUpdate() - } finally { - conn.close() - } - } - JDBCWriteDetails.saveTable(this, url, table) - } - - //////////////////////////////////////////////////////////////////////////// - // for Python API - //////////////////////////////////////////////////////////////////////////// - protected[sql] override def javaToPython: JavaRDD[Array[Byte]] = { - val fieldTypes = schema.fields.map(_.dataType) - val jrdd = rdd.map(EvaluatePython.rowToArray(_, fieldTypes)).toJavaRDD() - SerDeUtil.javaToPython(jrdd) - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala index 17158303b889a..d00175265924c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.types.NumericType * A set of methods for aggregations on a [[DataFrame]], created by [[DataFrame.groupBy]]. 
*/ @Experimental -class GroupedData protected[sql](df: DataFrameImpl, groupingExprs: Seq[Expression]) { +class GroupedData protected[sql](df: DataFrame, groupingExprs: Seq[Expression]) { private[this] implicit def toDF(aggExprs: Seq[NamedExpression]): DataFrame = { val namedGroupingExprs = groupingExprs.map { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala b/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala deleted file mode 100644 index b48b682b36e1f..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/IncomputableColumn.scala +++ /dev/null @@ -1,183 +0,0 @@ -/* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -package org.apache.spark.sql - -import scala.reflect.ClassTag -import scala.reflect.runtime.universe.TypeTag - -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedStar} -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.storage.StorageLevel -import org.apache.spark.sql.types.StructType - -private[sql] class IncomputableColumn(protected[sql] val expr: Expression) extends Column { - - def this(name: String) = this(name match { - case "*" => UnresolvedStar(None) - case _ if name.endsWith(".*") => UnresolvedStar(Some(name.substring(0, name.length - 2))) - case _ => UnresolvedAttribute(name) - }) - - private def err[T](): T = { - throw new UnsupportedOperationException("Cannot run this method on an UncomputableColumn") - } - - override def toString = expr.prettyString - - override def isComputable: Boolean = false - - override val sqlContext: SQLContext = null - - override def queryExecution = err() - - protected[sql] override def logicalPlan: LogicalPlan = err() - - override def toDF(colNames: String*): DataFrame = err() - - override def schema: StructType = err() - - override def dtypes: Array[(String, String)] = err() - - override def columns: Array[String] = err() - - override def printSchema(): Unit = err() - - override def show(): Unit = err() - - override def isLocal: Boolean = false - - override def join(right: DataFrame): DataFrame = err() - - override def join(right: DataFrame, joinExprs: Column): DataFrame = err() - - override def join(right: DataFrame, joinExprs: Column, joinType: String): DataFrame = err() - - override def sort(sortCol: String, sortCols: String*): DataFrame = err() - - override def sort(sortExprs: Column*): DataFrame = err() - - override def orderBy(sortCol: String, sortCols: String*): DataFrame = err() - - override def orderBy(sortExprs: Column*): DataFrame = err() - - override def col(colName: String): Column = err() - - override def select(cols: Column*): DataFrame = 
err() - - override def select(col: String, cols: String*): DataFrame = err() - - override def selectExpr(exprs: String*): DataFrame = err() - - override def withColumn(colName: String, col: Column): DataFrame = err() - - override def withColumnRenamed(existingName: String, newName: String): DataFrame = err() - - override def filter(condition: Column): DataFrame = err() - - override def filter(conditionExpr: String): DataFrame = err() - - override def where(condition: Column): DataFrame = err() - - override def groupBy(cols: Column*): GroupedData = err() - - override def groupBy(col1: String, cols: String*): GroupedData = err() - - override def limit(n: Int): DataFrame = err() - - override def unionAll(other: DataFrame): DataFrame = err() - - override def intersect(other: DataFrame): DataFrame = err() - - override def except(other: DataFrame): DataFrame = err() - - override def sample(withReplacement: Boolean, fraction: Double, seed: Long): DataFrame = err() - - override def explode[A <: Product : TypeTag] - (input: Column*)(f: Row => TraversableOnce[A]): DataFrame = err() - - override def explode[A, B : TypeTag]( - inputColumn: String, - outputColumn: String)( - f: A => TraversableOnce[B]): DataFrame = err() - - ///////////////////////////////////////////////////////////////////////////// - - override def head(n: Int): Array[Row] = err() - - override def head(): Row = err() - - override def first(): Row = err() - - override def map[R: ClassTag](f: Row => R): RDD[R] = err() - - override def flatMap[R: ClassTag](f: Row => TraversableOnce[R]): RDD[R] = err() - - override def mapPartitions[R: ClassTag](f: Iterator[Row] => Iterator[R]): RDD[R] = err() - - override def foreach(f: Row => Unit): Unit = err() - - override def foreachPartition(f: Iterator[Row] => Unit): Unit = err() - - override def take(n: Int): Array[Row] = err() - - override def collect(): Array[Row] = err() - - override def collectAsList(): java.util.List[Row] = err() - - override def count(): Long = err() - - override def repartition(numPartitions: Int): DataFrame = err() - - override def distinct: DataFrame = err() - - override def persist(): this.type = err() - - override def persist(newLevel: StorageLevel): this.type = err() - - override def unpersist(blocking: Boolean): this.type = err() - - override def rdd: RDD[Row] = err() - - override def registerTempTable(tableName: String): Unit = err() - - override def saveAsParquetFile(path: String): Unit = err() - - override def saveAsTable( - tableName: String, - source: String, - mode: SaveMode, - options: Map[String, String]): Unit = err() - - override def save( - source: String, - mode: SaveMode, - options: Map[String, String]): Unit = err() - - override def insertInto(tableName: String, overwrite: Boolean): Unit = err() - - def createJDBCTable(url: String, table: String, allowExisting: Boolean): Unit = err() - - def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit = err() - - override def toJSON: RDD[String] = err() - - protected[sql] override def javaToPython: JavaRDD[Array[Byte]] = err() -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index a63d733ece627..928b0deb61921 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -28,49 +28,10 @@ class ColumnExpressionSuite extends QueryTest { // TODO: Add test cases for bitwise 
operations. - test("computability check") { - def shouldBeComputable(c: Column): Unit = assert(c.isComputable === true) - - def shouldNotBeComputable(c: Column): Unit = { - assert(c.isComputable === false) - intercept[UnsupportedOperationException] { c.head() } - } - - shouldBeComputable(testData2("a")) - shouldBeComputable(testData2("b")) - - shouldBeComputable(testData2("a") + testData2("b")) - shouldBeComputable(testData2("a") + testData2("b") + 1) - - shouldBeComputable(-testData2("a")) - shouldBeComputable(!testData2("a")) - - shouldNotBeComputable(testData2.select(($"a" + 1).as("c"))("c") + testData2("b")) - shouldNotBeComputable( - testData2.select(($"a" + 1).as("c"))("c") + testData2.select(($"b" / 2).as("d"))("d")) - shouldNotBeComputable( - testData2.select(($"a" + 1).as("c")).select(($"c" + 2).as("d"))("d") + testData2("b")) - - // Literals and unresolved columns should not be computable. - shouldNotBeComputable(col("1")) - shouldNotBeComputable(col("1") + 2) - shouldNotBeComputable(lit(100)) - shouldNotBeComputable(lit(100) + 10) - shouldNotBeComputable(-col("1")) - shouldNotBeComputable(!col("1")) - - // Getting data from different frames should not be computable. - shouldNotBeComputable(testData2("a") + testData("key")) - shouldNotBeComputable(testData2("a") + 1 + testData("key")) - - // Aggregate functions alone should not be computable. - shouldNotBeComputable(sum(testData2("a"))) - } - test("collect on column produced by a binary operator") { val df = Seq((1, 2, 3)).toDF("a", "b", "c") - checkAnswer(df("a") + df("b"), Seq(Row(3))) - checkAnswer(df("a") + df("b").as("c"), Seq(Row(3))) + checkAnswer(df.select(df("a") + df("b")), Seq(Row(3))) + checkAnswer(df.select(df("a") + df("b").as("c")), Seq(Row(3))) } test("star") { @@ -78,7 +39,6 @@ class ColumnExpressionSuite extends QueryTest { } test("star qualified by data frame object") { - // This is not yet supported. val df = testData.toDF val goldAnswer = df.collect().toSeq checkAnswer(df.select(df("*")), goldAnswer) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index f31bc38922d4e..6b9b3a8425964 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -56,10 +56,7 @@ class DataFrameSuite extends QueryTest { test("dataframe toString") { assert(testData.toString === "[key: int, value: string]") - assert(testData("key").toString === "[key: int]") - } - - test("incomputable toString") { + assert(testData("key").toString === "key") assert($"test".toString === "test") } @@ -431,7 +428,7 @@ class DataFrameSuite extends QueryTest { test("apply on query results (SPARK-5462)") { val df = testData.sqlContext.sql("select key from testData") - checkAnswer(df("key"), testData.select('key).collect().toSeq) + checkAnswer(df.select(df("key")), testData.select('key).collect().toSeq) } } From a5fed34355b403188ad50b567ab62ee54597b493 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Thu, 19 Feb 2015 12:46:27 -0800 Subject: [PATCH 255/357] [SPARK-5902] [ml] Made PipelineStage.transformSchema public instead of private to ml For users to implement their own PipelineStages, we need to make PipelineStage.transformSchema be public instead of private to ml. This would be nice to include in Spark 1.3 CC: mengxr Author: Joseph K. Bradley Closes #4682 from jkbradley/SPARK-5902 and squashes the following commits: 6f02357 [Joseph K. 
Bradley] Made transformSchema public 0e6d0a0 [Joseph K. Bradley] made implementations of transformSchema protected as well fdaf26a [Joseph K. Bradley] Made PipelineStage.transformSchema protected instead of private[ml] --- .../scala/org/apache/spark/ml/Pipeline.scala | 16 ++++++++++++---- .../apache/spark/ml/feature/StandardScaler.scala | 4 ++-- .../spark/ml/impl/estimator/Predictor.scala | 4 ++-- .../org/apache/spark/ml/recommendation/ALS.scala | 4 ++-- .../apache/spark/ml/tuning/CrossValidator.scala | 4 ++-- 5 files changed, 20 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index 5607ed21afe18..5bbcd2e080e07 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml import scala.collection.mutable.ListBuffer import org.apache.spark.Logging -import org.apache.spark.annotation.AlphaComponent +import org.apache.spark.annotation.{AlphaComponent, DeveloperApi} import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType @@ -33,9 +33,17 @@ import org.apache.spark.sql.types.StructType abstract class PipelineStage extends Serializable with Logging { /** + * :: DeveloperAPI :: + * * Derives the output schema from the input schema and parameters. + * The schema describes the columns and types of the data. + * + * @param schema Input schema to this stage + * @param paramMap Parameters passed to this stage + * @return Output schema from this stage */ - private[ml] def transformSchema(schema: StructType, paramMap: ParamMap): StructType + @DeveloperApi + def transformSchema(schema: StructType, paramMap: ParamMap): StructType /** * Derives the output schema from the input schema and parameters, optionally with logging. 
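// An illustrative sketch of the kind of user-defined stage the change above enables now
// that transformSchema is public: a tiny Transformer that appends a constant column. It
// assumes the 1.3-era ml API, where transform and transformSchema still take an explicit
// ParamMap; the class and column names here are hypothetical, not part of the patch.
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

class AddBiasColumn extends Transformer {

  // Overridable from user code once transformSchema is public in PipelineStage.
  override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = {
    StructType(schema.fields :+ StructField("bias", DoubleType, nullable = false))
  }

  override def transform(dataset: DataFrame, paramMap: ParamMap): DataFrame = {
    // withColumn comes from the DataFrame API touched earlier in this patch series.
    dataset.withColumn("bias", lit(1.0))
  }
}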
@@ -126,7 +134,7 @@ class Pipeline extends Estimator[PipelineModel] { new PipelineModel(this, map, transformers.toArray) } - private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { + override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { val map = this.paramMap ++ paramMap val theStages = map(stages) require(theStages.toSet.size == theStages.size, @@ -171,7 +179,7 @@ class PipelineModel private[ml] ( stages.foldLeft(dataset)((cur, transformer) => transformer.transform(cur, map)) } - private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { + override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { // Precedence of ParamMaps: paramMap > this.paramMap > fittingParamMap val map = (fittingParamMap ++ this.paramMap) ++ paramMap stages.foldLeft(schema)((cur, transformer) => transformer.transformSchema(cur, map)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index ddbd648d64f23..1142aa4f8e73d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -55,7 +55,7 @@ class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerP model } - private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { + override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { val map = this.paramMap ++ paramMap val inputType = schema(map(inputCol)).dataType require(inputType.isInstanceOf[VectorUDT], @@ -91,7 +91,7 @@ class StandardScalerModel private[ml] ( dataset.withColumn(map(outputCol), scale(col(map(inputCol)))) } - private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { + override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { val map = this.paramMap ++ paramMap val inputType = schema(map(inputCol)).dataType require(inputType.isInstanceOf[VectorUDT], diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala index 7daeff980f0ea..dfb89cc8d4af3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala @@ -132,7 +132,7 @@ private[spark] abstract class Predictor[ @DeveloperApi protected def featuresDataType: DataType = new VectorUDT - private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { + override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { validateAndTransformSchema(schema, paramMap, fitting = true, featuresDataType) } @@ -184,7 +184,7 @@ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel @DeveloperApi protected def featuresDataType: DataType = new VectorUDT - private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { + override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { validateAndTransformSchema(schema, paramMap, fitting = false, featuresDataType) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 8d70e4347c4c9..c2ec716f08b7c 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -188,7 +188,7 @@ class ALSModel private[ml] ( .select(dataset("*"), predict(users("features"), items("features")).as(map(predictionCol))) } - override private[ml] def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { + override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { validateAndTransformSchema(schema, paramMap) } } @@ -292,7 +292,7 @@ class ALS extends Estimator[ALSModel] with ALSParams { model } - override private[ml] def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { + override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { validateAndTransformSchema(schema, paramMap) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index b07a68269cc2b..2eb1dac56f1e9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -129,7 +129,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP cvModel } - private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { + override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { val map = this.paramMap ++ paramMap map(estimator).transformSchema(schema, paramMap) } @@ -150,7 +150,7 @@ class CrossValidatorModel private[ml] ( bestModel.transform(dataset, paramMap) } - private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { + override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { bestModel.transformSchema(schema, paramMap) } } From ad6b169dee84df175b51933b7a3ad7f0bbc52cf3 Mon Sep 17 00:00:00 2001 From: Zhan Zhang Date: Thu, 19 Feb 2015 23:13:02 +0000 Subject: [PATCH 256/357] [Spark-5889] Remove pid file after stopping service. Currently the pid file is not deleted, and potentially may cause some problem after service is stopped. The fix remove the pid file after service stopped. Author: Zhan Zhang Closes #4676 from zhzhan/spark-5889 and squashes the following commits: eb01be1 [Zhan Zhang] solve review comments b4c009e [Zhan Zhang] solve review comments 018110a [Zhan Zhang] spark-5889: remove pid file after stopping service 088d2a2 [Zhan Zhang] squash all commits c1f1fa5 [Zhan Zhang] test --- sbin/spark-daemon.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sbin/spark-daemon.sh b/sbin/spark-daemon.sh index e1bcc7d64254a..5e812a1d91c6b 100755 --- a/sbin/spark-daemon.sh +++ b/sbin/spark-daemon.sh @@ -168,7 +168,7 @@ case $option in TARGET_ID="$(cat "$pid")" if [[ $(ps -p "$TARGET_ID" -o comm=) =~ "java" ]]; then echo "stopping $command" - kill "$TARGET_ID" + kill "$TARGET_ID" && rm -f "$pid" else echo "no $command to stop" fi From 34b7c35380c88569a1396fb4ed991a0bed4288e7 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Thu, 19 Feb 2015 15:35:23 -0800 Subject: [PATCH 257/357] SPARK-4682 [CORE] Consolidate various 'Clock' classes Another one from JoshRosen 's wish list. The first commit is much smaller and removes 2 of the 4 Clock classes. The second is much larger, necessary for consolidating the streaming one. I put together implementations in the way that seemed simplest. 
Almost all the change is standardizing class and method names. Author: Sean Owen Closes #4514 from srowen/SPARK-4682 and squashes the following commits: 5ed3a03 [Sean Owen] Javadoc Clock classes; make ManualClock private[spark] 169dd13 [Sean Owen] Add support for legacy org.apache.spark.streaming clock class names 277785a [Sean Owen] Reduce the net change in this patch by reversing some unnecessary syntax changes along the way b5e53df [Sean Owen] FakeClock -> ManualClock; getTime() -> getTimeMillis() 160863a [Sean Owen] Consolidate Streaming Clock class into common util Clock 7c956b2 [Sean Owen] Consolidate Clocks except for Streaming Clock --- .../spark/ExecutorAllocationManager.scala | 28 +----- .../spark/deploy/worker/DriverRunner.scala | 17 ++-- .../apache/spark/scheduler/DAGScheduler.scala | 20 ++--- .../spark/scheduler/TaskSetManager.scala | 16 ++-- .../scala/org/apache/spark/util/Clock.scala | 44 ++++++++- .../org/apache/spark/util/ManualClock.scala | 69 ++++++++++++++ .../ExecutorAllocationManagerSuite.scala | 65 +++++++------- .../deploy/worker/DriverRunnerTest.scala | 3 +- .../spark/scheduler/TaskSetManagerSuite.scala | 36 ++++---- .../org/apache/spark/util/FakeClock.scala | 26 ------ .../streaming/LocalJavaStreamingContext.java | 2 +- .../flume/FlumePollingStreamSuite.scala | 7 +- .../streaming/LocalJavaStreamingContext.java | 2 +- .../streaming/LocalJavaStreamingContext.java | 2 +- .../streaming/LocalJavaStreamingContext.java | 2 +- .../kinesis/KinesisCheckpointState.scala | 10 +-- .../kinesis/KinesisRecordProcessor.scala | 2 +- .../kinesis/KinesisReceiverSuite.scala | 25 +++--- .../JavaStreamingLogisticRegressionSuite.java | 2 +- .../JavaStreamingLinearRegressionSuite.java | 2 +- project/MimaExcludes.scala | 5 ++ .../streaming/dstream/FileInputDStream.scala | 6 +- .../streaming/receiver/BlockGenerator.scala | 3 +- .../receiver/ReceivedBlockHandler.scala | 4 +- .../streaming/scheduler/JobGenerator.scala | 13 ++- .../scheduler/ReceivedBlockTracker.scala | 6 +- .../apache/spark/streaming/util/Clock.scala | 89 ------------------- .../spark/streaming/util/RecurringTimer.scala | 5 +- .../streaming/util/WriteAheadLogManager.scala | 5 +- .../streaming/LocalJavaStreamingContext.java | 2 +- .../streaming/BasicOperationsSuite.scala | 9 +- .../spark/streaming/CheckpointSuite.scala | 33 ++++--- .../spark/streaming/InputStreamsSuite.scala | 37 ++++---- .../streaming/ReceivedBlockHandlerSuite.scala | 6 +- .../streaming/ReceivedBlockTrackerSuite.scala | 10 +-- .../spark/streaming/TestSuiteBase.scala | 15 ++-- .../streaming/util/WriteAheadLogSuite.scala | 10 +-- 37 files changed, 301 insertions(+), 337 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/util/ManualClock.scala delete mode 100644 core/src/test/scala/org/apache/spark/util/FakeClock.scala delete mode 100644 streaming/src/main/scala/org/apache/spark/streaming/util/Clock.scala diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 998695b6ac8ab..21c6e6ffa6666 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -20,6 +20,7 @@ package org.apache.spark import scala.collection.mutable import org.apache.spark.scheduler._ +import org.apache.spark.util.{SystemClock, Clock} /** * An agent that dynamically allocates and removes executors based on the workload. 
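// A rough sketch of how a test-only manual clock for the consolidated util.Clock trait
// could look; it is not the actual ManualClock added by this patch, which may differ in
// detail. The trait's two methods, getTimeMillis() and waitTillTime(), are the ones
// introduced in Clock.scala later in this patch; the class name and the package
// placement (needed because Clock is private[spark]) are assumptions.
package org.apache.spark.util

class SketchManualClock(private var time: Long = 0L) extends Clock {

  override def getTimeMillis(): Long = synchronized { time }

  // Block until another thread advances the clock to or past targetTime.
  override def waitTillTime(targetTime: Long): Long = synchronized {
    while (time < targetTime) {
      wait(10)
    }
    time
  }

  // Tests drive time forward explicitly instead of sleeping.
  def advance(timeToAdd: Long): Unit = synchronized {
    time += timeToAdd
    notifyAll()
  }
}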
@@ -123,7 +124,7 @@ private[spark] class ExecutorAllocationManager( private val intervalMillis: Long = 100 // Clock used to schedule when executors should be added and removed - private var clock: Clock = new RealClock + private var clock: Clock = new SystemClock() // Listener for Spark events that impact the allocation policy private val listener = new ExecutorAllocationListener @@ -588,28 +589,3 @@ private[spark] class ExecutorAllocationManager( private object ExecutorAllocationManager { val NOT_SET = Long.MaxValue } - -/** - * An abstract clock for measuring elapsed time. - */ -private trait Clock { - def getTimeMillis: Long -} - -/** - * A clock backed by a monotonically increasing time source. - * The time returned by this clock does not correspond to any notion of wall-clock time. - */ -private class RealClock extends Clock { - override def getTimeMillis: Long = System.nanoTime / (1000 * 1000) -} - -/** - * A clock that allows the caller to customize the time. - * This is used mainly for testing. - */ -private class TestClock(startTimeMillis: Long) extends Clock { - private var time: Long = startTimeMillis - override def getTimeMillis: Long = time - def tick(ms: Long): Unit = { time += ms } -} diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala index b964a09bdb218..e16bccb24d2c4 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala @@ -20,19 +20,18 @@ package org.apache.spark.deploy.worker import java.io._ import scala.collection.JavaConversions._ -import scala.collection.Map import akka.actor.ActorRef import com.google.common.base.Charsets.UTF_8 import com.google.common.io.Files -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileUtil, Path} import org.apache.spark.{Logging, SparkConf} -import org.apache.spark.deploy.{Command, DriverDescription, SparkHadoopUtil} +import org.apache.spark.deploy.{DriverDescription, SparkHadoopUtil} import org.apache.spark.deploy.DeployMessages.DriverStateChanged import org.apache.spark.deploy.master.DriverState import org.apache.spark.deploy.master.DriverState.DriverState +import org.apache.spark.util.{Clock, SystemClock} /** * Manages the execution of one driver, including automatically restarting the driver on failure. 
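// Sketch of the pattern the DriverRunner and scheduler changes below follow: elapsed time
// is measured through the injected Clock (getTimeMillis) rather than by calling
// System.currentTimeMillis directly, so tests can substitute a manual clock. This helper
// is illustrative only and, like Clock itself, would have to live inside the
// org.apache.spark package because the trait is private[spark].
package org.apache.spark.util

object TimedSketch {
  // Run `body`, returning its result together with the elapsed milliseconds as seen
  // through the supplied clock.
  def timed[T](clock: Clock)(body: => T): (T, Long) = {
    val start = clock.getTimeMillis()
    val result = body
    (result, clock.getTimeMillis() - start)
  }
}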
@@ -59,9 +58,7 @@ private[spark] class DriverRunner( // Decoupled for testing private[deploy] def setClock(_clock: Clock) = clock = _clock private[deploy] def setSleeper(_sleeper: Sleeper) = sleeper = _sleeper - private var clock = new Clock { - def currentTimeMillis(): Long = System.currentTimeMillis() - } + private var clock: Clock = new SystemClock() private var sleeper = new Sleeper { def sleep(seconds: Int): Unit = (0 until seconds).takeWhile(f => {Thread.sleep(1000); !killed}) } @@ -190,9 +187,9 @@ private[spark] class DriverRunner( initialize(process.get) } - val processStart = clock.currentTimeMillis() + val processStart = clock.getTimeMillis() val exitCode = process.get.waitFor() - if (clock.currentTimeMillis() - processStart > successfulRunDuration * 1000) { + if (clock.getTimeMillis() - processStart > successfulRunDuration * 1000) { waitSeconds = 1 } @@ -208,10 +205,6 @@ private[spark] class DriverRunner( } } -private[deploy] trait Clock { - def currentTimeMillis(): Long -} - private[deploy] trait Sleeper { def sleep(seconds: Int) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 8b62d2405ecb7..c58721c2c82b7 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -63,7 +63,7 @@ class DAGScheduler( mapOutputTracker: MapOutputTrackerMaster, blockManagerMaster: BlockManagerMaster, env: SparkEnv, - clock: org.apache.spark.util.Clock = SystemClock) + clock: Clock = new SystemClock()) extends Logging { def this(sc: SparkContext, taskScheduler: TaskScheduler) = { @@ -657,7 +657,7 @@ class DAGScheduler( // completion events or stage abort stageIdToStage -= s.id jobIdToStageIds -= job.jobId - listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTime(), jobResult)) + listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), jobResult)) } } @@ -706,7 +706,7 @@ class DAGScheduler( stage.latestInfo.stageFailed(stageFailedMessage) listenerBus.post(SparkListenerStageCompleted(stage.latestInfo)) } - listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTime(), JobFailed(error))) + listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobFailed(error))) } } @@ -745,7 +745,7 @@ class DAGScheduler( logInfo("Missing parents: " + getMissingParentStages(finalStage)) val shouldRunLocally = localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1 - val jobSubmissionTime = clock.getTime() + val jobSubmissionTime = clock.getTimeMillis() if (shouldRunLocally) { // Compute very short actions like first() or take() with no parent stages locally. listenerBus.post( @@ -871,7 +871,7 @@ class DAGScheduler( logDebug("New pending tasks: " + stage.pendingTasks) taskScheduler.submitTasks( new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties)) - stage.latestInfo.submissionTime = Some(clock.getTime()) + stage.latestInfo.submissionTime = Some(clock.getTimeMillis()) } else { // Because we posted SparkListenerStageSubmitted earlier, we should post // SparkListenerStageCompleted here in case there are no tasks to run. 
@@ -940,12 +940,12 @@ class DAGScheduler( def markStageAsFinished(stage: Stage, errorMessage: Option[String] = None) = { val serviceTime = stage.latestInfo.submissionTime match { - case Some(t) => "%.03f".format((clock.getTime() - t) / 1000.0) + case Some(t) => "%.03f".format((clock.getTimeMillis() - t) / 1000.0) case _ => "Unknown" } if (errorMessage.isEmpty) { logInfo("%s (%s) finished in %s s".format(stage, stage.name, serviceTime)) - stage.latestInfo.completionTime = Some(clock.getTime()) + stage.latestInfo.completionTime = Some(clock.getTimeMillis()) } else { stage.latestInfo.stageFailed(errorMessage.get) logInfo("%s (%s) failed in %s s".format(stage, stage.name, serviceTime)) @@ -971,7 +971,7 @@ class DAGScheduler( markStageAsFinished(stage) cleanupStateForJobAndIndependentStages(job) listenerBus.post( - SparkListenerJobEnd(job.jobId, clock.getTime(), JobSucceeded)) + SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobSucceeded)) } // taskSucceeded runs some user code that might throw an exception. Make sure @@ -1187,7 +1187,7 @@ class DAGScheduler( } val dependentJobs: Seq[ActiveJob] = activeJobs.filter(job => stageDependsOn(job.finalStage, failedStage)).toSeq - failedStage.latestInfo.completionTime = Some(clock.getTime()) + failedStage.latestInfo.completionTime = Some(clock.getTimeMillis()) for (job <- dependentJobs) { failJobAndIndependentStages(job, s"Job aborted due to stage failure: $reason") } @@ -1242,7 +1242,7 @@ class DAGScheduler( if (ableToCancelStages) { job.listener.jobFailed(error) cleanupStateForJobAndIndependentStages(job) - listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTime(), JobFailed(error))) + listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobFailed(error))) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 99a5f7117790d..529237f0d35dc 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -51,7 +51,7 @@ private[spark] class TaskSetManager( sched: TaskSchedulerImpl, val taskSet: TaskSet, val maxTaskFailures: Int, - clock: Clock = SystemClock) + clock: Clock = new SystemClock()) extends Schedulable with Logging { val conf = sched.sc.conf @@ -166,7 +166,7 @@ private[spark] class TaskSetManager( // last launched a task at that level, and move up a level when localityWaits[curLevel] expires. // We then move down if we manage to launch a "more local" task. 
var currentLocalityIndex = 0 // Index of our current locality level in validLocalityLevels - var lastLaunchTime = clock.getTime() // Time we last launched a task at this level + var lastLaunchTime = clock.getTimeMillis() // Time we last launched a task at this level override def schedulableQueue = null @@ -281,7 +281,7 @@ private[spark] class TaskSetManager( val failed = failedExecutors.get(taskId).get return failed.contains(execId) && - clock.getTime() - failed.get(execId).get < EXECUTOR_TASK_BLACKLIST_TIMEOUT + clock.getTimeMillis() - failed.get(execId).get < EXECUTOR_TASK_BLACKLIST_TIMEOUT } false @@ -428,7 +428,7 @@ private[spark] class TaskSetManager( : Option[TaskDescription] = { if (!isZombie) { - val curTime = clock.getTime() + val curTime = clock.getTimeMillis() var allowedLocality = maxLocality @@ -459,7 +459,7 @@ private[spark] class TaskSetManager( lastLaunchTime = curTime } // Serialize and return the task - val startTime = clock.getTime() + val startTime = clock.getTimeMillis() val serializedTask: ByteBuffer = try { Task.serializeWithDependencies(task, sched.sc.addedFiles, sched.sc.addedJars, ser) } catch { @@ -674,7 +674,7 @@ private[spark] class TaskSetManager( return } val key = ef.description - val now = clock.getTime() + val now = clock.getTimeMillis() val (printFull, dupCount) = { if (recentExceptions.contains(key)) { val (dupCount, printTime) = recentExceptions(key) @@ -706,7 +706,7 @@ private[spark] class TaskSetManager( } // always add to failed executors failedExecutors.getOrElseUpdate(index, new HashMap[String, Long]()). - put(info.executorId, clock.getTime()) + put(info.executorId, clock.getTimeMillis()) sched.dagScheduler.taskEnded(tasks(index), reason, null, null, info, taskMetrics) addPendingTask(index) if (!isZombie && state != TaskState.KILLED && !reason.isInstanceOf[TaskCommitDenied]) { @@ -821,7 +821,7 @@ private[spark] class TaskSetManager( val minFinishedForSpeculation = (SPECULATION_QUANTILE * numTasks).floor.toInt logDebug("Checking for speculative tasks: minFinished = " + minFinishedForSpeculation) if (tasksSuccessful >= minFinishedForSpeculation && tasksSuccessful > 0) { - val time = clock.getTime() + val time = clock.getTimeMillis() val durations = taskInfos.values.filter(_.successful).map(_.duration).toArray Arrays.sort(durations) val medianDuration = durations(min((0.5 * tasksSuccessful).round.toInt, durations.size - 1)) diff --git a/core/src/main/scala/org/apache/spark/util/Clock.scala b/core/src/main/scala/org/apache/spark/util/Clock.scala index 97c2b45aabf28..e92ed11bd165b 100644 --- a/core/src/main/scala/org/apache/spark/util/Clock.scala +++ b/core/src/main/scala/org/apache/spark/util/Clock.scala @@ -21,9 +21,47 @@ package org.apache.spark.util * An interface to represent clocks, so that they can be mocked out in unit tests. */ private[spark] trait Clock { - def getTime(): Long + def getTimeMillis(): Long + def waitTillTime(targetTime: Long): Long } -private[spark] object SystemClock extends Clock { - def getTime(): Long = System.currentTimeMillis() +/** + * A clock backed by the actual time from the OS as reported by the `System` API. 
+ */ +private[spark] class SystemClock extends Clock { + + val minPollTime = 25L + + /** + * @return the same time (milliseconds since the epoch) + * as is reported by `System.currentTimeMillis()` + */ + def getTimeMillis(): Long = System.currentTimeMillis() + + /** + * @param targetTime block until the current time is at least this value + * @return current system time when wait has completed + */ + def waitTillTime(targetTime: Long): Long = { + var currentTime = 0L + currentTime = System.currentTimeMillis() + + var waitTime = targetTime - currentTime + if (waitTime <= 0) { + return currentTime + } + + val pollTime = math.max(waitTime / 10.0, minPollTime).toLong + + while (true) { + currentTime = System.currentTimeMillis() + waitTime = targetTime - currentTime + if (waitTime <= 0) { + return currentTime + } + val sleepTime = math.min(waitTime, pollTime) + Thread.sleep(sleepTime) + } + -1 + } } diff --git a/core/src/main/scala/org/apache/spark/util/ManualClock.scala b/core/src/main/scala/org/apache/spark/util/ManualClock.scala new file mode 100644 index 0000000000000..cf89c1782fd67 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/util/ManualClock.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util + +/** + * A `Clock` whose time can be manually set and modified. Its reported time does not change + * as time elapses, but only as its time is modified by callers. This is mainly useful for + * testing. 
+ * + * @param time initial time (in milliseconds since the epoch) + */ +private[spark] class ManualClock(private var time: Long) extends Clock { + + /** + * @return `ManualClock` with initial time 0 + */ + def this() = this(0L) + + def getTimeMillis(): Long = + synchronized { + time + } + + /** + * @param timeToSet new time (in milliseconds) that the clock should represent + */ + def setTime(timeToSet: Long) = + synchronized { + time = timeToSet + notifyAll() + } + + /** + * @param timeToAdd time (in milliseconds) to add to the clock's time + */ + def advance(timeToAdd: Long) = + synchronized { + time += timeToAdd + notifyAll() + } + + /** + * @param targetTime block until the clock time is set or advanced to at least this time + * @return current time reported by the clock when waiting finishes + */ + def waitTillTime(targetTime: Long): Long = + synchronized { + while (time < targetTime) { + wait(100) + } + getTimeMillis() + } + +} diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index d3123e854016b..abfcee75728dc 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -23,6 +23,7 @@ import org.scalatest.{FunSuite, PrivateMethodTester} import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo +import org.apache.spark.util.ManualClock /** * Test add and remove behavior of ExecutorAllocationManager. @@ -321,7 +322,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { test("starting/canceling add timer") { sc = createSparkContext(2, 10) - val clock = new TestClock(8888L) + val clock = new ManualClock(8888L) val manager = sc.executorAllocationManager.get manager.setClock(clock) @@ -330,21 +331,21 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { onSchedulerBacklogged(manager) val firstAddTime = addTime(manager) assert(firstAddTime === clock.getTimeMillis + schedulerBacklogTimeout * 1000) - clock.tick(100L) + clock.advance(100L) onSchedulerBacklogged(manager) assert(addTime(manager) === firstAddTime) // timer is already started - clock.tick(200L) + clock.advance(200L) onSchedulerBacklogged(manager) assert(addTime(manager) === firstAddTime) onSchedulerQueueEmpty(manager) // Restart add timer - clock.tick(1000L) + clock.advance(1000L) assert(addTime(manager) === NOT_SET) onSchedulerBacklogged(manager) val secondAddTime = addTime(manager) assert(secondAddTime === clock.getTimeMillis + schedulerBacklogTimeout * 1000) - clock.tick(100L) + clock.advance(100L) onSchedulerBacklogged(manager) assert(addTime(manager) === secondAddTime) // timer is already started assert(addTime(manager) !== firstAddTime) @@ -353,7 +354,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { test("starting/canceling remove timers") { sc = createSparkContext(2, 10) - val clock = new TestClock(14444L) + val clock = new ManualClock(14444L) val manager = sc.executorAllocationManager.get manager.setClock(clock) @@ -366,17 +367,17 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { assert(removeTimes(manager).contains("1")) val firstRemoveTime = removeTimes(manager)("1") assert(firstRemoveTime === clock.getTimeMillis + executorIdleTimeout * 1000) - clock.tick(100L) + clock.advance(100L) 
onExecutorIdle(manager, "1") assert(removeTimes(manager)("1") === firstRemoveTime) // timer is already started - clock.tick(200L) + clock.advance(200L) onExecutorIdle(manager, "1") assert(removeTimes(manager)("1") === firstRemoveTime) - clock.tick(300L) + clock.advance(300L) onExecutorIdle(manager, "2") assert(removeTimes(manager)("2") !== firstRemoveTime) // different executor assert(removeTimes(manager)("2") === clock.getTimeMillis + executorIdleTimeout * 1000) - clock.tick(400L) + clock.advance(400L) onExecutorIdle(manager, "3") assert(removeTimes(manager)("3") !== firstRemoveTime) assert(removeTimes(manager)("3") === clock.getTimeMillis + executorIdleTimeout * 1000) @@ -385,7 +386,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { assert(removeTimes(manager).contains("3")) // Restart remove timer - clock.tick(1000L) + clock.advance(1000L) onExecutorBusy(manager, "1") assert(removeTimes(manager).size === 2) onExecutorIdle(manager, "1") @@ -401,7 +402,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { test("mock polling loop with no events") { sc = createSparkContext(1, 20) val manager = sc.executorAllocationManager.get - val clock = new TestClock(2020L) + val clock = new ManualClock(2020L) manager.setClock(clock) // No events - we should not be adding or removing @@ -410,15 +411,15 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { schedule(manager) assert(numExecutorsPending(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) - clock.tick(100L) + clock.advance(100L) schedule(manager) assert(numExecutorsPending(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) - clock.tick(1000L) + clock.advance(1000L) schedule(manager) assert(numExecutorsPending(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) - clock.tick(10000L) + clock.advance(10000L) schedule(manager) assert(numExecutorsPending(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) @@ -426,57 +427,57 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { test("mock polling loop add behavior") { sc = createSparkContext(1, 20) - val clock = new TestClock(2020L) + val clock = new ManualClock(2020L) val manager = sc.executorAllocationManager.get manager.setClock(clock) sc.listenerBus.postToAll(SparkListenerStageSubmitted(createStageInfo(0, 1000))) // Scheduler queue backlogged onSchedulerBacklogged(manager) - clock.tick(schedulerBacklogTimeout * 1000 / 2) + clock.advance(schedulerBacklogTimeout * 1000 / 2) schedule(manager) assert(numExecutorsPending(manager) === 0) // timer not exceeded yet - clock.tick(schedulerBacklogTimeout * 1000) + clock.advance(schedulerBacklogTimeout * 1000) schedule(manager) assert(numExecutorsPending(manager) === 1) // first timer exceeded - clock.tick(sustainedSchedulerBacklogTimeout * 1000 / 2) + clock.advance(sustainedSchedulerBacklogTimeout * 1000 / 2) schedule(manager) assert(numExecutorsPending(manager) === 1) // second timer not exceeded yet - clock.tick(sustainedSchedulerBacklogTimeout * 1000) + clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) assert(numExecutorsPending(manager) === 1 + 2) // second timer exceeded - clock.tick(sustainedSchedulerBacklogTimeout * 1000) + clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) assert(numExecutorsPending(manager) === 1 + 2 + 4) // third timer exceeded // Scheduler queue drained onSchedulerQueueEmpty(manager) - 
clock.tick(sustainedSchedulerBacklogTimeout * 1000) + clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) assert(numExecutorsPending(manager) === 7) // timer is canceled - clock.tick(sustainedSchedulerBacklogTimeout * 1000) + clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) assert(numExecutorsPending(manager) === 7) // Scheduler queue backlogged again onSchedulerBacklogged(manager) - clock.tick(schedulerBacklogTimeout * 1000) + clock.advance(schedulerBacklogTimeout * 1000) schedule(manager) assert(numExecutorsPending(manager) === 7 + 1) // timer restarted - clock.tick(sustainedSchedulerBacklogTimeout * 1000) + clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) assert(numExecutorsPending(manager) === 7 + 1 + 2) - clock.tick(sustainedSchedulerBacklogTimeout * 1000) + clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) assert(numExecutorsPending(manager) === 7 + 1 + 2 + 4) - clock.tick(sustainedSchedulerBacklogTimeout * 1000) + clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) assert(numExecutorsPending(manager) === 20) // limit reached } test("mock polling loop remove behavior") { sc = createSparkContext(1, 20) - val clock = new TestClock(2020L) + val clock = new ManualClock(2020L) val manager = sc.executorAllocationManager.get manager.setClock(clock) @@ -486,11 +487,11 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { onExecutorAdded(manager, "executor-3") assert(removeTimes(manager).size === 3) assert(executorsPendingToRemove(manager).isEmpty) - clock.tick(executorIdleTimeout * 1000 / 2) + clock.advance(executorIdleTimeout * 1000 / 2) schedule(manager) assert(removeTimes(manager).size === 3) // idle threshold not reached yet assert(executorsPendingToRemove(manager).isEmpty) - clock.tick(executorIdleTimeout * 1000) + clock.advance(executorIdleTimeout * 1000) schedule(manager) assert(removeTimes(manager).isEmpty) // idle threshold exceeded assert(executorsPendingToRemove(manager).size === 2) // limit reached (1 executor remaining) @@ -511,7 +512,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { assert(!removeTimes(manager).contains("executor-5")) assert(!removeTimes(manager).contains("executor-6")) assert(executorsPendingToRemove(manager).size === 2) - clock.tick(executorIdleTimeout * 1000) + clock.advance(executorIdleTimeout * 1000) schedule(manager) assert(removeTimes(manager).isEmpty) // idle executors are removed assert(executorsPendingToRemove(manager).size === 4) @@ -529,7 +530,7 @@ class ExecutorAllocationManagerSuite extends FunSuite with LocalSparkContext { assert(removeTimes(manager).contains("executor-5")) assert(removeTimes(manager).contains("executor-6")) assert(executorsPendingToRemove(manager).size === 4) - clock.tick(executorIdleTimeout * 1000) + clock.advance(executorIdleTimeout * 1000) schedule(manager) assert(removeTimes(manager).isEmpty) assert(executorsPendingToRemove(manager).size === 6) // limit reached (1 executor remaining) diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala index b6f4411e0587a..aa6e4874cecde 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala @@ -27,6 +27,7 @@ import org.scalatest.FunSuite import org.apache.spark.SparkConf import 
org.apache.spark.deploy.{Command, DriverDescription} +import org.apache.spark.util.Clock class DriverRunnerTest extends FunSuite { private def createDriverRunner() = { @@ -129,7 +130,7 @@ class DriverRunnerTest extends FunSuite { .thenReturn(-1) // fail 3 .thenReturn(-1) // fail 4 .thenReturn(0) // success - when(clock.currentTimeMillis()) + when(clock.getTimeMillis()) .thenReturn(0).thenReturn(1000) // fail 1 (short) .thenReturn(1000).thenReturn(2000) // fail 2 (short) .thenReturn(2000).thenReturn(10000) // fail 3 (long) diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index 59580561cb45a..12330d8f63c40 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -27,7 +27,7 @@ import org.scalatest.FunSuite import org.apache.spark._ import org.apache.spark.executor.TaskMetrics -import org.apache.spark.util.FakeClock +import org.apache.spark.util.ManualClock class FakeDAGScheduler(sc: SparkContext, taskScheduler: FakeTaskScheduler) extends DAGScheduler(sc) { @@ -164,7 +164,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { sc = new SparkContext("local", "test") val sched = new FakeTaskScheduler(sc, ("exec1", "host1")) val taskSet = FakeTask.createTaskSet(1) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // Offer a host with NO_PREF as the constraint, @@ -213,7 +213,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { sc = new SparkContext("local", "test") val sched = new FakeTaskScheduler(sc, ("execA", "host1"), ("execC", "host2")) val taskSet = FakeTask.createTaskSet(1, Seq(TaskLocation("host1", "execB"))) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // An executor that is not NODE_LOCAL should be rejected. 
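An illustrative sketch (not part of the patch) of the substitution these tests make: the test-only FakeClock deleted later in this patch already had advance but exposed getTime, while the shared ManualClock keeps advance, renames the getter to getTimeMillis, and additionally supports setTime and a blocking waitTillTime. The sched / taskSet / MAX_TASK_FAILURES fixtures are the ones defined earlier in this suite.

    val clock = new ManualClock             // starts at time 0, like the old FakeClock
    val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock)
    clock.advance(3000L)                    // same call the old FakeClock offered
    assert(clock.getTimeMillis() === 3000L) // was clock.getTime()
    // Unlike FakeClock, ManualClock can also block: a thread inside
    // clock.waitTillTime(5000L) wakes once another thread advances the clock to 5000.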
@@ -234,7 +234,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { Seq(TaskLocation("host1"), TaskLocation("host2", "exec2")), Seq() // Last task has no locality prefs ) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // First offer host1, exec1: first task should be chosen assert(manager.resourceOffer("exec1", "host1", ANY).get.index === 0) @@ -263,7 +263,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { Seq(TaskLocation("host2", "exec3")), Seq() // Last task has no locality prefs ) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // First offer host1, exec1: first task should be chosen assert(manager.resourceOffer("exec1", "host1", PROCESS_LOCAL).get.index === 0) @@ -283,7 +283,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { Seq(TaskLocation("host3")), Seq(TaskLocation("host2")) ) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // First offer host1: first task should be chosen @@ -321,7 +321,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { Seq(TaskLocation("host2")), Seq(TaskLocation("host3")) ) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // First offer host1: first task should be chosen @@ -353,7 +353,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { sc = new SparkContext("local", "test") val sched = new FakeTaskScheduler(sc, ("exec1", "host1")) val taskSet = FakeTask.createTaskSet(1) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) assert(manager.resourceOffer("exec1", "host1", ANY).get.index === 0) @@ -370,7 +370,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { sc = new SparkContext("local", "test") val sched = new FakeTaskScheduler(sc, ("exec1", "host1")) val taskSet = FakeTask.createTaskSet(1) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // Fail the task MAX_TASK_FAILURES times, and check that the task set is aborted @@ -402,7 +402,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { ("exec1.1", "host1"), ("exec2", "host2")) // affinity to exec1 on host1 - which we will fail. 
val taskSet = FakeTask.createTaskSet(1, Seq(TaskLocation("host1", "exec1"))) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, 4, clock) { @@ -486,7 +486,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { Seq(TaskLocation("host1", "execB")), Seq(TaskLocation("host2", "execC")), Seq()) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // Only ANY is valid assert(manager.myLocalityLevels.sameElements(Array(NO_PREF, ANY))) @@ -522,7 +522,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { val taskSet = FakeTask.createTaskSet(2, Seq(TaskLocation("host1", "execA")), Seq(TaskLocation("host1", "execA"))) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) assert(manager.myLocalityLevels.sameElements(Array(PROCESS_LOCAL, NODE_LOCAL, RACK_LOCAL, ANY))) @@ -611,7 +611,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { Seq(TaskLocation("host2"), TaskLocation("host1")), Seq(), Seq(TaskLocation("host3", "execC"))) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) assert(manager.resourceOffer("execA", "host1", PROCESS_LOCAL).get.index === 0) @@ -637,7 +637,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { Seq(TaskLocation("host2")), Seq(), Seq(TaskLocation("host3"))) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // node-local tasks are scheduled without delay @@ -658,7 +658,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { Seq(TaskLocation("host2")), Seq(ExecutorCacheTaskLocation("host1", "execA")), Seq(ExecutorCacheTaskLocation("host2", "execB"))) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // process-local tasks are scheduled first @@ -678,7 +678,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { Seq(), Seq(ExecutorCacheTaskLocation("host1", "execA")), Seq(ExecutorCacheTaskLocation("host2", "execB"))) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // process-local tasks are scheduled first @@ -698,7 +698,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { val taskSet = FakeTask.createTaskSet(2, Seq(TaskLocation("host1", "execA")), Seq(TaskLocation("host2", "execB.1"))) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) // Only ANY is valid assert(manager.myLocalityLevels.sameElements(Array(ANY))) @@ -732,7 +732,7 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging { Seq(HostTaskLocation("host1")), Seq(HostTaskLocation("host2")), Seq(HDFSCacheTaskLocation("host3"))) - val clock = new FakeClock + val clock = new ManualClock val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock) assert(manager.myLocalityLevels.sameElements(Array(PROCESS_LOCAL, NODE_LOCAL, ANY))) sched.removeExecutor("execA") diff --git a/core/src/test/scala/org/apache/spark/util/FakeClock.scala 
b/core/src/test/scala/org/apache/spark/util/FakeClock.scala deleted file mode 100644 index 0a45917b08dd2..0000000000000 --- a/core/src/test/scala/org/apache/spark/util/FakeClock.scala +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.util - -class FakeClock extends Clock { - private var time = 0L - - def advance(millis: Long): Unit = time += millis - - def getTime(): Long = time -} diff --git a/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 1e24da7f5f60c..cfedb5a042a35 100644 --- a/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -31,7 +31,7 @@ public void setUp() { SparkConf conf = new SparkConf() .setMaster("local[2]") .setAppName("test") - .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + .set("spark.streaming.clock", "org.apache.spark.util.ManualClock"); ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala index b57a1c71e35b9..e04d4088df7dc 100644 --- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala +++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala @@ -34,10 +34,9 @@ import org.scalatest.{BeforeAndAfter, FunSuite} import org.apache.spark.{SparkConf, Logging} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.ReceiverInputDStream -import org.apache.spark.streaming.util.ManualClock import org.apache.spark.streaming.{Seconds, TestOutputStream, StreamingContext} import org.apache.spark.streaming.flume.sink._ -import org.apache.spark.util.Utils +import org.apache.spark.util.{ManualClock, Utils} class FlumePollingStreamSuite extends FunSuite with BeforeAndAfter with Logging { @@ -54,7 +53,7 @@ class FlumePollingStreamSuite extends FunSuite with BeforeAndAfter with Logging def beforeFunction() { logInfo("Using manual clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") + conf.set("spark.streaming.clock", "org.apache.spark.util.ManualClock") } before(beforeFunction()) @@ -236,7 +235,7 @@ class FlumePollingStreamSuite extends FunSuite with BeforeAndAfter with Logging tx.commit() tx.close() Thread.sleep(500) // Allow some time for the events to reach - clock.addToTime(batchDuration.milliseconds) + 
clock.advance(batchDuration.milliseconds) } null } diff --git a/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 1e24da7f5f60c..cfedb5a042a35 100644 --- a/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -31,7 +31,7 @@ public void setUp() { SparkConf conf = new SparkConf() .setMaster("local[2]") .setAppName("test") - .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + .set("spark.streaming.clock", "org.apache.spark.util.ManualClock"); ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/external/twitter/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/external/twitter/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 1e24da7f5f60c..cfedb5a042a35 100644 --- a/external/twitter/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/external/twitter/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -31,7 +31,7 @@ public void setUp() { SparkConf conf = new SparkConf() .setMaster("local[2]") .setAppName("test") - .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + .set("spark.streaming.clock", "org.apache.spark.util.ManualClock"); ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 1e24da7f5f60c..cfedb5a042a35 100644 --- a/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -31,7 +31,7 @@ public void setUp() { SparkConf conf = new SparkConf() .setMaster("local[2]") .setAppName("test") - .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + .set("spark.streaming.clock", "org.apache.spark.util.ManualClock"); ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala index 0b80b611cdce7..588e86a1887ec 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala @@ -18,9 +18,7 @@ package org.apache.spark.streaming.kinesis import org.apache.spark.Logging import org.apache.spark.streaming.Duration -import org.apache.spark.streaming.util.Clock -import org.apache.spark.streaming.util.ManualClock -import org.apache.spark.streaming.util.SystemClock +import org.apache.spark.util.{Clock, ManualClock, SystemClock} /** * This is a helper class for managing checkpoint clocks. 
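A small sketch of the checkpoint-clock arithmetic after the rename, consistent with the KinesisReceiverSuite expectations further down (a ManualClock stands in for the mocked currentClock; KinesisCheckpointState is private[kinesis], so this would live in that package):

    import org.apache.spark.streaming.Milliseconds
    import org.apache.spark.util.ManualClock

    val currentClock = new ManualClock(0L)
    val state = new KinesisCheckpointState(Milliseconds(10), currentClock)
    assert(state.checkpointClock.getTimeMillis() == 10)  // currentClock time + interval
    state.advanceCheckpoint()                            // checkpointClock.advance(interval)
    assert(state.checkpointClock.getTimeMillis() == 20)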
@@ -35,7 +33,7 @@ private[kinesis] class KinesisCheckpointState( /* Initialize the checkpoint clock using the given currentClock + checkpointInterval millis */ val checkpointClock = new ManualClock() - checkpointClock.setTime(currentClock.currentTime() + checkpointInterval.milliseconds) + checkpointClock.setTime(currentClock.getTimeMillis() + checkpointInterval.milliseconds) /** * Check if it's time to checkpoint based on the current time and the derived time @@ -44,13 +42,13 @@ private[kinesis] class KinesisCheckpointState( * @return true if it's time to checkpoint */ def shouldCheckpoint(): Boolean = { - new SystemClock().currentTime() > checkpointClock.currentTime() + new SystemClock().getTimeMillis() > checkpointClock.getTimeMillis() } /** * Advance the checkpoint clock by the checkpoint interval. */ def advanceCheckpoint() = { - checkpointClock.addToTime(checkpointInterval.milliseconds) + checkpointClock.advance(checkpointInterval.milliseconds) } } diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala index 8ecc2d90160b1..af8cd875b4541 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala @@ -104,7 +104,7 @@ private[kinesis] class KinesisRecordProcessor( logDebug(s"Checkpoint: WorkerId $workerId completed checkpoint of ${batch.size}" + s" records for shardId $shardId") logDebug(s"Checkpoint: Next checkpoint is at " + - s" ${checkpointState.checkpointClock.currentTime()} for shardId $shardId") + s" ${checkpointState.checkpointClock.getTimeMillis()} for shardId $shardId") } } catch { case e: Throwable => { diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala index f56898af029c1..255fe65819608 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala @@ -25,8 +25,7 @@ import org.apache.spark.streaming.Milliseconds import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.TestSuiteBase -import org.apache.spark.streaming.util.Clock -import org.apache.spark.streaming.util.ManualClock +import org.apache.spark.util.{ManualClock, Clock} import org.mockito.Mockito._ import org.scalatest.BeforeAndAfter @@ -129,45 +128,45 @@ class KinesisReceiverSuite extends TestSuiteBase with Matchers with BeforeAndAft } test("should set checkpoint time to currentTime + checkpoint interval upon instantiation") { - when(currentClockMock.currentTime()).thenReturn(0) + when(currentClockMock.getTimeMillis()).thenReturn(0) val checkpointIntervalMillis = 10 val checkpointState = new KinesisCheckpointState(Milliseconds(checkpointIntervalMillis), currentClockMock) - assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) + assert(checkpointState.checkpointClock.getTimeMillis() == checkpointIntervalMillis) - verify(currentClockMock, times(1)).currentTime() + verify(currentClockMock, times(1)).getTimeMillis() } test("should checkpoint if we have exceeded the checkpoint interval") { - 
when(currentClockMock.currentTime()).thenReturn(0) + when(currentClockMock.getTimeMillis()).thenReturn(0) val checkpointState = new KinesisCheckpointState(Milliseconds(Long.MinValue), currentClockMock) assert(checkpointState.shouldCheckpoint()) - verify(currentClockMock, times(1)).currentTime() + verify(currentClockMock, times(1)).getTimeMillis() } test("shouldn't checkpoint if we have not exceeded the checkpoint interval") { - when(currentClockMock.currentTime()).thenReturn(0) + when(currentClockMock.getTimeMillis()).thenReturn(0) val checkpointState = new KinesisCheckpointState(Milliseconds(Long.MaxValue), currentClockMock) assert(!checkpointState.shouldCheckpoint()) - verify(currentClockMock, times(1)).currentTime() + verify(currentClockMock, times(1)).getTimeMillis() } test("should add to time when advancing checkpoint") { - when(currentClockMock.currentTime()).thenReturn(0) + when(currentClockMock.getTimeMillis()).thenReturn(0) val checkpointIntervalMillis = 10 val checkpointState = new KinesisCheckpointState(Milliseconds(checkpointIntervalMillis), currentClockMock) - assert(checkpointState.checkpointClock.currentTime() == checkpointIntervalMillis) + assert(checkpointState.checkpointClock.getTimeMillis() == checkpointIntervalMillis) checkpointState.advanceCheckpoint() - assert(checkpointState.checkpointClock.currentTime() == (2 * checkpointIntervalMillis)) + assert(checkpointState.checkpointClock.getTimeMillis() == (2 * checkpointIntervalMillis)) - verify(currentClockMock, times(1)).currentTime() + verify(currentClockMock, times(1)).getTimeMillis() } test("shutdown should checkpoint if the reason is TERMINATE") { diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java index ac945ba6f23c1..640d2ec55e4e7 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaStreamingLogisticRegressionSuite.java @@ -47,7 +47,7 @@ public void setUp() { SparkConf conf = new SparkConf() .setMaster("local[2]") .setAppName("test") - .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + .set("spark.streaming.clock", "org.apache.spark.util.ManualClock"); ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java index a4dd1ac39a3c8..899c4ea607869 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaStreamingLinearRegressionSuite.java @@ -45,7 +45,7 @@ public void setUp() { SparkConf conf = new SparkConf() .setMaster("local[2]") .setAppName("test") - .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + .set("spark.streaming.clock", "org.apache.spark.util.ManualClock"); ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 4065a562a1a18..ee6229aa6bbe1 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -148,6 +148,11 @@ object MimaExcludes { 
ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.mllib.linalg.VectorUDT"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.mllib.linalg.VectorUDT.serialize"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.mllib.linalg.VectorUDT.sqlType") + ) ++ Seq( + // SPARK-4682 + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.RealClock"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.Clock"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.TestClock") ) case v if v.startsWith("1.2") => diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala index 4f7db41abe76f..22de8c02e63c8 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala @@ -88,7 +88,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K,V]]( // Initial ignore threshold based on which old, existing files in the directory (at the time of // starting the streaming application) will be ignored or considered - private val initialModTimeIgnoreThreshold = if (newFilesOnly) clock.currentTime() else 0L + private val initialModTimeIgnoreThreshold = if (newFilesOnly) clock.getTimeMillis() else 0L /* * Make sure that the information of files selected in the last few batches are remembered. @@ -161,7 +161,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K,V]]( */ private def findNewFiles(currentTime: Long): Array[String] = { try { - lastNewFileFindingTime = clock.currentTime() + lastNewFileFindingTime = clock.getTimeMillis() // Calculate ignore threshold val modTimeIgnoreThreshold = math.max( @@ -174,7 +174,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K,V]]( def accept(path: Path): Boolean = isNewFile(path, currentTime, modTimeIgnoreThreshold) } val newFiles = fs.listStatus(directoryPath, filter).map(_.getPath.toString) - val timeTaken = clock.currentTime() - lastNewFileFindingTime + val timeTaken = clock.getTimeMillis() - lastNewFileFindingTime logInfo("Finding new files took " + timeTaken + " ms") logDebug("# cached file times = " + fileToModTime.size) if (timeTaken > slideDuration.milliseconds) { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala index 79263a7183977..ee5e639b26d91 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala @@ -23,7 +23,8 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.{Logging, SparkConf} import org.apache.spark.storage.StreamBlockId -import org.apache.spark.streaming.util.{RecurringTimer, SystemClock} +import org.apache.spark.streaming.util.RecurringTimer +import org.apache.spark.util.SystemClock /** Listener object for BlockGenerator events */ private[streaming] trait BlockGeneratorListener { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala index f7a8ebee8a544..dcdc27d29c270 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala +++ 
b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala @@ -27,8 +27,8 @@ import org.apache.hadoop.fs.Path import org.apache.spark.{Logging, SparkConf, SparkException} import org.apache.spark.storage._ -import org.apache.spark.streaming.util.{Clock, SystemClock, WriteAheadLogFileSegment, WriteAheadLogManager} -import org.apache.spark.util.Utils +import org.apache.spark.streaming.util.{WriteAheadLogFileSegment, WriteAheadLogManager} +import org.apache.spark.util.{Clock, SystemClock, Utils} /** Trait that represents the metadata related to storage of blocks */ private[streaming] trait ReceivedBlockStoreResult { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala index 8632c94349bf9..ac92774a38273 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala @@ -23,7 +23,8 @@ import akka.actor.{ActorRef, Props, Actor} import org.apache.spark.{SparkEnv, Logging} import org.apache.spark.streaming.{Checkpoint, CheckpointWriter, Time} -import org.apache.spark.streaming.util.{Clock, ManualClock, RecurringTimer} +import org.apache.spark.streaming.util.RecurringTimer +import org.apache.spark.util.{Clock, ManualClock} /** Event classes for JobGenerator */ private[scheduler] sealed trait JobGeneratorEvent @@ -45,8 +46,14 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { val clock = { val clockClass = ssc.sc.conf.get( - "spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") - Class.forName(clockClass).newInstance().asInstanceOf[Clock] + "spark.streaming.clock", "org.apache.spark.util.SystemClock") + try { + Class.forName(clockClass).newInstance().asInstanceOf[Clock] + } catch { + case e: ClassNotFoundException if clockClass.startsWith("org.apache.spark.streaming") => + val newClockClass = clockClass.replace("org.apache.spark.streaming", "org.apache.spark") + Class.forName(newClockClass).newInstance().asInstanceOf[Clock] + } } private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds, diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index e19ac939f9ac5..200cf4ef4b0f1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -27,8 +27,8 @@ import org.apache.hadoop.fs.Path import org.apache.spark.{SparkException, Logging, SparkConf} import org.apache.spark.streaming.Time -import org.apache.spark.streaming.util.{Clock, WriteAheadLogManager} -import org.apache.spark.util.Utils +import org.apache.spark.streaming.util.WriteAheadLogManager +import org.apache.spark.util.{Clock, Utils} /** Trait representing any event in the ReceivedBlockTracker that updates its state. */ private[streaming] sealed trait ReceivedBlockTrackerLogEvent @@ -150,7 +150,7 @@ private[streaming] class ReceivedBlockTracker( * returns only after the files are cleaned up. 
*/ def cleanupOldBatches(cleanupThreshTime: Time, waitForCompletion: Boolean): Unit = synchronized { - assert(cleanupThreshTime.milliseconds < clock.currentTime()) + assert(cleanupThreshTime.milliseconds < clock.getTimeMillis()) val timesToCleanup = timeToAllocatedBlocks.keys.filter { _ < cleanupThreshTime }.toSeq logInfo("Deleting batches " + timesToCleanup) writeToLog(BatchCleanupEvent(timesToCleanup)) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/Clock.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/Clock.scala deleted file mode 100644 index d6d96d7ba00fd..0000000000000 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/Clock.scala +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.streaming.util - -private[streaming] -trait Clock { - def currentTime(): Long - def waitTillTime(targetTime: Long): Long -} - -private[streaming] -class SystemClock() extends Clock { - - val minPollTime = 25L - - def currentTime(): Long = { - System.currentTimeMillis() - } - - def waitTillTime(targetTime: Long): Long = { - var currentTime = 0L - currentTime = System.currentTimeMillis() - - var waitTime = targetTime - currentTime - if (waitTime <= 0) { - return currentTime - } - - val pollTime = math.max(waitTime / 10.0, minPollTime).toLong - - while (true) { - currentTime = System.currentTimeMillis() - waitTime = targetTime - currentTime - if (waitTime <= 0) { - return currentTime - } - val sleepTime = math.min(waitTime, pollTime) - Thread.sleep(sleepTime) - } - -1 - } -} - -private[streaming] -class ManualClock() extends Clock { - - private var time = 0L - - def currentTime() = this.synchronized { - time - } - - def setTime(timeToSet: Long) = { - this.synchronized { - time = timeToSet - this.notifyAll() - } - } - - def addToTime(timeToAdd: Long) = { - this.synchronized { - time += timeToAdd - this.notifyAll() - } - } - def waitTillTime(targetTime: Long): Long = { - this.synchronized { - while (time < targetTime) { - this.wait(100) - } - } - currentTime() - } -} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala index 1a616a0434f2c..c8eef833eb431 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala @@ -18,6 +18,7 @@ package org.apache.spark.streaming.util import org.apache.spark.Logging +import org.apache.spark.util.{Clock, SystemClock} private[streaming] class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String) @@ -38,7 +39,7 @@ class RecurringTimer(clock: Clock, period: Long, 
callback: (Long) => Unit, name: * current system time. */ def getStartTime(): Long = { - (math.floor(clock.currentTime.toDouble / period) + 1).toLong * period + (math.floor(clock.getTimeMillis().toDouble / period) + 1).toLong * period } /** @@ -48,7 +49,7 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: * more than current time. */ def getRestartTime(originalStartTime: Long): Long = { - val gap = clock.currentTime - originalStartTime + val gap = clock.getTimeMillis() - originalStartTime (math.floor(gap.toDouble / period).toLong + 1) * period + originalStartTime } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogManager.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogManager.scala index 166661b7496df..985ded9111f74 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogManager.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogManager.scala @@ -19,13 +19,12 @@ package org.apache.spark.streaming.util import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer -import scala.concurrent.duration.Duration import scala.concurrent.{Await, ExecutionContext, Future} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.Logging -import org.apache.spark.util.Utils +import org.apache.spark.util.{Clock, SystemClock, Utils} import WriteAheadLogManager._ /** @@ -82,7 +81,7 @@ private[streaming] class WriteAheadLogManager( var succeeded = false while (!succeeded && failures < maxFailures) { try { - fileSegment = getLogWriter(clock.currentTime).write(byteBuffer) + fileSegment = getLogWriter(clock.getTimeMillis()).write(byteBuffer) succeeded = true } catch { case ex: Exception => diff --git a/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 1e24da7f5f60c..cfedb5a042a35 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -31,7 +31,7 @@ public void setUp() { SparkConf conf = new SparkConf() .setMaster("local[2]") .setAppName("test") - .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + .set("spark.streaming.clock", "org.apache.spark.util.ManualClock"); ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala index e8f4a7779ec21..cf191715d29d6 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala @@ -22,13 +22,12 @@ import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} import scala.language.existentials import scala.reflect.ClassTag -import util.ManualClock - import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.SparkContext._ import org.apache.spark.rdd.{BlockRDD, RDD} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.{DStream, WindowedDStream} +import org.apache.spark.util.{Clock, ManualClock} import org.apache.spark.HashPartitioner class BasicOperationsSuite extends TestSuiteBase { @@ -586,7 +585,7 @@ class 
BasicOperationsSuite extends TestSuiteBase { for (i <- 0 until input.size) { testServer.send(input(i).toString + "\n") Thread.sleep(200) - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) collectRddInfo() } @@ -637,8 +636,8 @@ class BasicOperationsSuite extends TestSuiteBase { ssc.graph.getOutputStreams().head.dependencies.head.asInstanceOf[DStream[T]] if (rememberDuration != null) ssc.remember(rememberDuration) val output = runStreams[(Int, Int)](ssc, cleanupTestInput.size, numExpectedOutput) - val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - assert(clock.currentTime() === Seconds(10).milliseconds) + val clock = ssc.scheduler.clock.asInstanceOf[Clock] + assert(clock.getTimeMillis() === Seconds(10).milliseconds) assert(output.size === numExpectedOutput) operatedStream } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 8f8bc61437ba5..03c448f1df5f1 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -32,8 +32,7 @@ import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutput import org.scalatest.concurrent.Eventually._ import org.apache.spark.streaming.dstream.{DStream, FileInputDStream} -import org.apache.spark.streaming.util.ManualClock -import org.apache.spark.util.Utils +import org.apache.spark.util.{Clock, ManualClock, Utils} /** * This test suites tests the checkpointing functionality of DStreams - @@ -61,7 +60,7 @@ class CheckpointSuite extends TestSuiteBase { assert(batchDuration === Milliseconds(500), "batchDuration for this test must be 1 second") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") + conf.set("spark.streaming.clock", "org.apache.spark.util.ManualClock") val stateStreamCheckpointInterval = Seconds(1) val fs = FileSystem.getLocal(new Configuration()) @@ -324,13 +323,13 @@ class CheckpointSuite extends TestSuiteBase { * Writes a file named `i` (which contains the number `i`) to the test directory and sets its * modification time to `clock`'s current time. 
*/ - def writeFile(i: Int, clock: ManualClock): Unit = { + def writeFile(i: Int, clock: Clock): Unit = { val file = new File(testDir, i.toString) Files.write(i + "\n", file, Charsets.UTF_8) - assert(file.setLastModified(clock.currentTime())) + assert(file.setLastModified(clock.getTimeMillis())) // Check that the file's modification date is actually the value we wrote, since rounding or // truncation will break the test: - assert(file.lastModified() === clock.currentTime()) + assert(file.lastModified() === clock.getTimeMillis()) } /** @@ -372,13 +371,13 @@ class CheckpointSuite extends TestSuiteBase { ssc.start() // Advance half a batch so that the first file is created after the StreamingContext starts - clock.addToTime(batchDuration.milliseconds / 2) + clock.advance(batchDuration.milliseconds / 2) // Create files and advance manual clock to process them for (i <- Seq(1, 2, 3)) { writeFile(i, clock) // Advance the clock after creating the file to avoid a race when // setting its modification time - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) if (i != 3) { // Since we want to shut down while the 3rd batch is processing eventually(eventuallyTimeout) { @@ -386,7 +385,7 @@ class CheckpointSuite extends TestSuiteBase { } } } - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) eventually(eventuallyTimeout) { // Wait until all files have been recorded and all batches have started assert(recordedFiles(ssc) === Seq(1, 2, 3) && batchCounter.getNumStartedBatches === 3) @@ -410,7 +409,7 @@ class CheckpointSuite extends TestSuiteBase { writeFile(i, clock) // Advance the clock after creating the file to avoid a race when // setting its modification time - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) } // Recover context from checkpoint file and verify whether the files that were @@ -419,7 +418,7 @@ class CheckpointSuite extends TestSuiteBase { withStreamingContext(new StreamingContext(checkpointDir)) { ssc => // So that the restarted StreamingContext's clock has gone forward in time since failure ssc.conf.set("spark.streaming.manualClock.jump", (batchDuration * 3).milliseconds.toString) - val oldClockTime = clock.currentTime() + val oldClockTime = clock.getTimeMillis() clock = ssc.scheduler.clock.asInstanceOf[ManualClock] val batchCounter = new BatchCounter(ssc) val outputStream = ssc.graph.getOutputStreams().head.asInstanceOf[TestOutputStream[Int]] @@ -430,7 +429,7 @@ class CheckpointSuite extends TestSuiteBase { ssc.start() // Verify that the clock has traveled forward to the expected time eventually(eventuallyTimeout) { - clock.currentTime() === oldClockTime + clock.getTimeMillis() === oldClockTime } // Wait for pre-failure batch to be recomputed (3 while SSC was down plus last batch) val numBatchesAfterRestart = 4 @@ -441,12 +440,12 @@ class CheckpointSuite extends TestSuiteBase { writeFile(i, clock) // Advance the clock after creating the file to avoid a race when // setting its modification time - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) eventually(eventuallyTimeout) { assert(batchCounter.getNumCompletedBatches === index + numBatchesAfterRestart + 1) } } - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) logInfo("Output after restart = " + outputStream.output.mkString("[", ", ", "]")) assert(outputStream.output.size > 0, "No files processed after restart") ssc.stop() @@ -521,12 +520,12 @@ 
class CheckpointSuite extends TestSuiteBase { */ def advanceTimeWithRealDelay[V: ClassTag](ssc: StreamingContext, numBatches: Long): Seq[Seq[V]] = { val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - logInfo("Manual clock before advancing = " + clock.currentTime()) + logInfo("Manual clock before advancing = " + clock.getTimeMillis()) for (i <- 1 to numBatches.toInt) { - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) Thread.sleep(batchDuration.milliseconds) } - logInfo("Manual clock after advancing = " + clock.currentTime()) + logInfo("Manual clock after advancing = " + clock.getTimeMillis()) Thread.sleep(batchDuration.milliseconds) val outputStream = ssc.graph.getOutputStreams.filter { dstream => diff --git a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala index 01084a457db4f..7ed6320a3d0bc 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala @@ -17,12 +17,8 @@ package org.apache.spark.streaming -import akka.actor.Actor -import akka.actor.Props -import akka.util.ByteString - import java.io.{File, BufferedWriter, OutputStreamWriter} -import java.net.{InetSocketAddress, SocketException, ServerSocket} +import java.net.{SocketException, ServerSocket} import java.nio.charset.Charset import java.util.concurrent.{Executors, TimeUnit, ArrayBlockingQueue} import java.util.concurrent.atomic.AtomicInteger @@ -36,9 +32,8 @@ import org.scalatest.concurrent.Eventually._ import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.util.ManualClock -import org.apache.spark.util.Utils -import org.apache.spark.streaming.receiver.{ActorHelper, Receiver} +import org.apache.spark.util.{ManualClock, Utils} +import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.rdd.RDD import org.apache.hadoop.io.{Text, LongWritable} import org.apache.hadoop.mapreduce.lib.input.TextInputFormat @@ -69,7 +64,7 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { for (i <- 0 until input.size) { testServer.send(input(i).toString + "\n") Thread.sleep(500) - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) } Thread.sleep(1000) logInfo("Stopping server") @@ -120,19 +115,19 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { // Advance the clock so that the files are created after StreamingContext starts, but // not enough to trigger a batch - clock.addToTime(batchDuration.milliseconds / 2) + clock.advance(batchDuration.milliseconds / 2) val input = Seq(1, 2, 3, 4, 5) input.foreach { i => Thread.sleep(batchDuration.milliseconds) val file = new File(testDir, i.toString) Files.write(Array[Byte](i.toByte), file) - assert(file.setLastModified(clock.currentTime())) - assert(file.lastModified === clock.currentTime) + assert(file.setLastModified(clock.getTimeMillis())) + assert(file.lastModified === clock.getTimeMillis()) logInfo("Created file " + file) // Advance the clock after creating the file to avoid a race when // setting its modification time - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) eventually(eventuallyTimeout) { assert(batchCounter.getNumCompletedBatches === i) } @@ -179,7 +174,7 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { 
while((!MultiThreadTestReceiver.haveAllThreadsFinished || output.sum < numTotalRecords) && System.currentTimeMillis() - startTime < 5000) { Thread.sleep(100) - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) } Thread.sleep(1000) logInfo("Stopping context") @@ -214,7 +209,7 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { for (i <- 0 until input.size) { // Enqueue more than 1 item per tick but they should dequeue one at a time inputIterator.take(2).foreach(i => queue += ssc.sparkContext.makeRDD(Seq(i))) - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) } Thread.sleep(1000) logInfo("Stopping context") @@ -256,12 +251,12 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { // Enqueue the first 3 items (one by one), they should be merged in the next batch val inputIterator = input.toIterator inputIterator.take(3).foreach(i => queue += ssc.sparkContext.makeRDD(Seq(i))) - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) Thread.sleep(1000) // Enqueue the remaining items (again one by one), merged in the final batch inputIterator.foreach(i => queue += ssc.sparkContext.makeRDD(Seq(i))) - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) Thread.sleep(1000) logInfo("Stopping context") ssc.stop() @@ -308,19 +303,19 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { // Advance the clock so that the files are created after StreamingContext starts, but // not enough to trigger a batch - clock.addToTime(batchDuration.milliseconds / 2) + clock.advance(batchDuration.milliseconds / 2) // Over time, create files in the directory val input = Seq(1, 2, 3, 4, 5) input.foreach { i => val file = new File(testDir, i.toString) Files.write(i + "\n", file, Charset.forName("UTF-8")) - assert(file.setLastModified(clock.currentTime())) - assert(file.lastModified === clock.currentTime) + assert(file.setLastModified(clock.getTimeMillis())) + assert(file.lastModified === clock.getTimeMillis()) logInfo("Created file " + file) // Advance the clock after creating the file to avoid a race when // setting its modification time - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) eventually(eventuallyTimeout) { assert(batchCounter.getNumCompletedBatches === i) } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala index 132ff2443fc0f..818f551dbe996 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala @@ -39,7 +39,7 @@ import org.apache.spark.shuffle.hash.HashShuffleManager import org.apache.spark.storage._ import org.apache.spark.streaming.receiver._ import org.apache.spark.streaming.util._ -import org.apache.spark.util.AkkaUtils +import org.apache.spark.util.{AkkaUtils, ManualClock} import WriteAheadLogBasedBlockHandler._ import WriteAheadLogSuite._ @@ -165,7 +165,7 @@ class ReceivedBlockHandlerSuite extends FunSuite with BeforeAndAfter with Matche preCleanupLogFiles.size should be > 1 // this depends on the number of blocks inserted using generateAndStoreData() - manualClock.currentTime() shouldEqual 5000L + manualClock.getTimeMillis() shouldEqual 5000L val cleanupThreshTime = 3000L 
handler.cleanupOldBlocks(cleanupThreshTime) @@ -243,7 +243,7 @@ class ReceivedBlockHandlerSuite extends FunSuite with BeforeAndAfter with Matche val blockIds = Seq.fill(blocks.size)(generateBlockId()) val storeResults = blocks.zip(blockIds).map { case (block, id) => - manualClock.addToTime(500) // log rolling interval set to 1000 ms through SparkConf + manualClock.advance(500) // log rolling interval set to 1000 ms through SparkConf logDebug("Inserting block " + id) receivedBlockHandler.storeBlock(id, block) }.toList diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala index fbb7b0bfebafc..a3a0fd5187403 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala @@ -34,9 +34,9 @@ import org.apache.spark.{Logging, SparkConf, SparkException} import org.apache.spark.storage.StreamBlockId import org.apache.spark.streaming.receiver.BlockManagerBasedStoreResult import org.apache.spark.streaming.scheduler._ -import org.apache.spark.streaming.util.{Clock, ManualClock, SystemClock, WriteAheadLogReader} +import org.apache.spark.streaming.util.WriteAheadLogReader import org.apache.spark.streaming.util.WriteAheadLogSuite._ -import org.apache.spark.util.Utils +import org.apache.spark.util.{Clock, ManualClock, SystemClock, Utils} class ReceivedBlockTrackerSuite extends FunSuite with BeforeAndAfter with Matchers with Logging { @@ -100,7 +100,7 @@ class ReceivedBlockTrackerSuite def incrementTime() { val timeIncrementMillis = 2000L - manualClock.addToTime(timeIncrementMillis) + manualClock.advance(timeIncrementMillis) } // Generate and add blocks to the given tracker @@ -138,13 +138,13 @@ class ReceivedBlockTrackerSuite tracker2.getUnallocatedBlocks(streamId).toList shouldEqual blockInfos1 // Allocate blocks to batch and verify whether the unallocated blocks got allocated - val batchTime1 = manualClock.currentTime + val batchTime1 = manualClock.getTimeMillis() tracker2.allocateBlocksToBatch(batchTime1) tracker2.getBlocksOfBatchAndStream(batchTime1, streamId) shouldEqual blockInfos1 // Add more blocks and allocate to another batch incrementTime() - val batchTime2 = manualClock.currentTime + val batchTime2 = manualClock.getTimeMillis() val blockInfos2 = addBlockInfos(tracker2) tracker2.allocateBlocksToBatch(batchTime2) tracker2.getBlocksOfBatchAndStream(batchTime2, streamId) shouldEqual blockInfos2 diff --git a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala index 7d82c3e4aadcf..c2375ff65edac 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -31,10 +31,9 @@ import org.scalatest.concurrent.PatienceConfiguration import org.apache.spark.streaming.dstream.{DStream, InputDStream, ForEachDStream} import org.apache.spark.streaming.scheduler.{StreamingListenerBatchStarted, StreamingListenerBatchCompleted, StreamingListener} -import org.apache.spark.streaming.util.ManualClock import org.apache.spark.{SparkConf, Logging} import org.apache.spark.rdd.RDD -import org.apache.spark.util.Utils +import org.apache.spark.util.{ManualClock, Utils} /** * This is a input stream just for the testsuites. 
This is equivalent to a checkpointable, @@ -189,10 +188,10 @@ trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { def beforeFunction() { if (useManualClock) { logInfo("Using manual clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock") + conf.set("spark.streaming.clock", "org.apache.spark.util.ManualClock") } else { logInfo("Using real clock") - conf.set("spark.streaming.clock", "org.apache.spark.streaming.util.SystemClock") + conf.set("spark.streaming.clock", "org.apache.spark.util.SystemClock") } } @@ -333,17 +332,17 @@ trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { // Advance manual clock val clock = ssc.scheduler.clock.asInstanceOf[ManualClock] - logInfo("Manual clock before advancing = " + clock.currentTime()) + logInfo("Manual clock before advancing = " + clock.getTimeMillis()) if (actuallyWait) { for (i <- 1 to numBatches) { logInfo("Actually waiting for " + batchDuration) - clock.addToTime(batchDuration.milliseconds) + clock.advance(batchDuration.milliseconds) Thread.sleep(batchDuration.milliseconds) } } else { - clock.addToTime(numBatches * batchDuration.milliseconds) + clock.advance(numBatches * batchDuration.milliseconds) } - logInfo("Manual clock after advancing = " + clock.currentTime()) + logInfo("Manual clock after advancing = " + clock.getTimeMillis()) // Wait until expected number of output items have been generated val startTime = System.currentTimeMillis() diff --git a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala index 7ce9499dc614d..8335659667f22 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala @@ -26,7 +26,7 @@ import scala.language.{implicitConversions, postfixOps} import WriteAheadLogSuite._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.spark.util.Utils +import org.apache.spark.util.{ManualClock, Utils} import org.scalatest.{BeforeAndAfter, FunSuite} import org.scalatest.concurrent.Eventually._ @@ -197,7 +197,7 @@ class WriteAheadLogSuite extends FunSuite with BeforeAndAfter { val logFiles = getLogFilesInDirectory(testDir) assert(logFiles.size > 1) - manager.cleanupOldLogs(manualClock.currentTime() / 2, waitForCompletion) + manager.cleanupOldLogs(manualClock.getTimeMillis() / 2, waitForCompletion) if (waitForCompletion) { assert(getLogFilesInDirectory(testDir).size < logFiles.size) @@ -219,7 +219,7 @@ class WriteAheadLogSuite extends FunSuite with BeforeAndAfter { // Recover old files and generate a second set of log files val dataToWrite2 = generateRandomData() - manualClock.addToTime(100000) + manualClock.advance(100000) writeDataUsingManager(testDir, dataToWrite2, manualClock) val logFiles2 = getLogFilesInDirectory(testDir) assert(logFiles2.size > logFiles1.size) @@ -279,12 +279,12 @@ object WriteAheadLogSuite { manualClock: ManualClock = new ManualClock, stopManager: Boolean = true ): WriteAheadLogManager = { - if (manualClock.currentTime < 100000) manualClock.setTime(10000) + if (manualClock.getTimeMillis() < 100000) manualClock.setTime(10000) val manager = new WriteAheadLogManager(logDirectory, hadoopConf, rollingIntervalSecs = 1, callerName = "WriteAheadLogSuite", clock = manualClock) // Ensure that 500 does not get sorted after 2000, so put a high base value. 
data.foreach { item => - manualClock.addToTime(500) + manualClock.advance(500) manager.writeToLog(item) } if (stopManager) manager.stop() From 6bddc40353057a562c78e75c5549c79a0d7d5f8b Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Thu, 19 Feb 2015 15:50:58 -0800 Subject: [PATCH 258/357] SPARK-5570: No docs stating that `new SparkConf().set("spark.driver.memory", ...) will not work I've updated documentation to reflect true behavior of this setting in client vs. cluster mode. Author: Ilya Ganelin Closes #4665 from ilganeli/SPARK-5570 and squashes the following commits: 5d1c8dd [Ilya Ganelin] Added example configuration code a51700a [Ilya Ganelin] Getting rid of extra spaces 85f7a08 [Ilya Ganelin] Reworded note 5889d43 [Ilya Ganelin] Formatting adjustment f149ba1 [Ilya Ganelin] Minor updates 1fec7a5 [Ilya Ganelin] Updated to add clarification for other driver properties db47595 [Ilya Ganelin] Slight formatting update c899564 [Ilya Ganelin] Merge remote-tracking branch 'upstream/master' into SPARK-5570 17b751d [Ilya Ganelin] Updated documentation for driver-memory to reflect its true behavior in client vs cluster mode --- docs/configuration.md | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index eb0d6d33c97d9..541695c83ae36 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -115,7 +115,11 @@ of the most common options to set are: Amount of memory to use for the driver process, i.e. where SparkContext is initialized. (e.g. 512m, 2g). - + +
    Note: In client mode, this config must not be set through the SparkConf + directly in your application, because the driver JVM has already started at that point. + Instead, please set this through the --driver-memory command line option + or in your default properties file. spark.executor.memory @@ -214,6 +218,11 @@ Apart from these, the following properties are also available, and may be useful (none) A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. + +
    Note: In client mode, this config must not be set through the SparkConf + directly in your application, because the driver JVM has already started at that point. + Instead, please set this through the --driver-java-options command line option or in + your default properties file. @@ -221,6 +230,11 @@ Apart from these, the following properties are also available, and may be useful (none) Extra classpath entries to append to the classpath of the driver. + +
    Note: In client mode, this config must not be set through the SparkConf + directly in your application, because the driver JVM has already started at that point. + Instead, please set this through the --driver-class-path command line option or in + your default properties file. @@ -228,6 +242,11 @@ Apart from these, the following properties are also available, and may be useful (none) Set a special library path to use when launching the driver JVM. + +
    Note: In client mode, this config must not be set through the SparkConf + directly in your application, because the driver JVM has already started at that point. + Instead, please set this through the --driver-library-path command line option or in + your default properties file. @@ -237,6 +256,8 @@ Apart from these, the following properties are also available, and may be useful (Experimental) Whether to give user-added jars precedence over Spark's own jars when loading classes in the the driver. This feature can be used to mitigate conflicts between Spark's dependencies and user dependencies. It is currently an experimental feature. + + This is used in cluster mode only. From 0cfd2cebde0b7fac3779eda80d6e42223f8a3d9f Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 19 Feb 2015 18:06:16 -0800 Subject: [PATCH 259/357] [SPARK-5900][MLLIB] make PIC and FPGrowth Java-friendly In the previous version, PIC stores clustering assignments as an `RDD[(Long, Int)]`. This is mapped to `RDD>` in Java and hence Java users have to cast types manually. We should either create a new method called `javaAssignments` that returns `JavaRDD[(java.lang.Long, java.lang.Int)]` or wrap the result pair in a class. I chose the latter approach in this PR. Now assignments are stored as an `RDD[Assignment]`, where `Assignment` is a class with `id` and `cluster`. Similarly, in FPGrowth, the frequent itemsets are stored as an `RDD[(Array[Item], Long)]`, which is mapped to `RDD>`. Though we provide a "Java-friendly" method `javaFreqItemsets` that returns `JavaRDD[(Array[Item], java.lang.Long)]`. It doesn't really work because `Array[Item]` is mapped to `Object` in Java. So in this PR I created a class `FreqItemset` to wrap the results. It has `items` and `freq`, as well as a `javaItems` method that returns `List` in Java. I'm not certain that the names I chose are proper: `Assignment`/`id`/`cluster` and `FreqItemset`/`items`/`freq`. Please let me know if there are better suggestions. 
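For illustration only, a minimal sketch of how the wrapped FPGrowth results are consumed from Scala after this change, mirroring the updated docs in this patch (assuming a fitted `model: FPGrowthModel[String]`; the variable name is not part of the patch):

{% highlight scala %}
// Frequent itemsets are now FPGrowth.FreqItemset objects rather than
// (Array[Item], Long) pairs, so the fields are accessed by name.
model.freqItemsets.collect().foreach { itemset =>
  println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
}
{% endhighlight %}
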
CC: jkbradley Author: Xiangrui Meng Closes #4695 from mengxr/SPARK-5900 and squashes the following commits: 865b5ca [Xiangrui Meng] make Assignment serializable cffa96e [Xiangrui Meng] fix test 9c0e590 [Xiangrui Meng] remove unused Tuple2 1b9db3d [Xiangrui Meng] make PIC and FPGrowth Java-friendly --- docs/mllib-clustering.md | 8 ++-- docs/mllib-frequent-pattern-mining.md | 12 +++--- .../examples/mllib/JavaFPGrowthExample.java | 8 ++-- .../JavaPowerIterationClusteringExample.java | 5 +-- .../examples/mllib/FPGrowthExample.scala | 4 +- .../PowerIterationClusteringExample.scala | 8 +--- .../clustering/PowerIterationClustering.scala | 33 ++++++++++++--- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 41 +++++++++++++------ .../spark/mllib/fpm/JavaFPGrowthSuite.java | 30 +++++--------- .../PowerIterationClusteringSuite.scala | 8 ++-- .../spark/mllib/fpm/FPGrowthSuite.scala | 10 ++--- 11 files changed, 93 insertions(+), 74 deletions(-) diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 6e46a47338398..0b6db4fcb7b1f 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -314,8 +314,8 @@ val pic = new PowerIteartionClustering() .setMaxIterations(20) val model = pic.run(similarities) -model.assignments.foreach { case (vertexId, clusterId) => - println(s"$vertexId -> $clusterId") +model.assignments.foreach { a => + println(s"${a.id} -> ${a.cluster}") } {% endhighlight %} @@ -349,8 +349,8 @@ PowerIterationClustering pic = new PowerIterationClustering() .setMaxIterations(10); PowerIterationClusteringModel model = pic.run(similarities); -for (Tuple2 assignment: model.assignments().toJavaRDD().collect()) { - System.out.println(assignment._1() + " -> " + assignment._2()); +for (PowerIterationClustering.Assignment a: model.assignments().toJavaRDD().collect()) { + System.out.println(a.id() + " -> " + a.cluster()); } {% endhighlight %}
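
A further usage sketch (not part of the diff itself): with the new `Assignment` class, per-cluster membership can be derived on the driver, mirroring the change to PowerIterationClusteringExample.scala later in this patch. Here `model` is assumed to be a fitted `PowerIterationClusteringModel`.

{% highlight scala %}
// Group vertex ids by their assigned cluster id.
val clusters: Map[Int, Array[Long]] =
  model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id))
clusters.toSeq.sortBy(_._1).foreach { case (c, ids) =>
  println(s"$c -> ${ids.sorted.mkString("[", ",", "]")}")
}
{% endhighlight %}
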
    diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md index 0ff9738768aca..9fd9be0dd01b1 100644 --- a/docs/mllib-frequent-pattern-mining.md +++ b/docs/mllib-frequent-pattern-mining.md @@ -57,8 +57,8 @@ val fpg = new FPGrowth() .setNumPartitions(10) val model = fpg.run(transactions) -model.freqItemsets.collect().foreach { case (itemset, freq) => - println(itemset.mkString("[", ",", "]") + ", " + freq) +model.freqItemsets.collect().foreach { itemset => + println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) } {% endhighlight %} @@ -74,10 +74,9 @@ Calling `FPGrowth.run` with transactions returns an that stores the frequent itemsets with their frequencies. {% highlight java %} -import java.util.Arrays; import java.util.List; -import scala.Tuple2; +import com.google.common.base.Joiner; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.mllib.fpm.FPGrowth; @@ -88,11 +87,10 @@ JavaRDD> transactions = ... FPGrowth fpg = new FPGrowth() .setMinSupport(0.2) .setNumPartitions(10); - FPGrowthModel model = fpg.run(transactions); -for (Tuple2 s: model.javaFreqItemsets().collect()) { - System.out.println("(" + Arrays.toString((Object[]) s._1()) + "): " + s._2()); +for (FPGrowth.FreqItemset itemset: model.freqItemsets().toJavaRDD().collect()) { + System.out.println("[" + Joiner.on(",").join(s.javaItems()) + "], " + s.freq()); } {% endhighlight %} diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java index 0db572d7607a9..f50e802cf683c 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java @@ -18,10 +18,8 @@ package org.apache.spark.examples.mllib; import java.util.ArrayList; -import java.util.Arrays; - -import scala.Tuple2; +import com.google.common.base.Joiner; import com.google.common.collect.Lists; import org.apache.spark.SparkConf; @@ -54,8 +52,8 @@ public static void main(String[] args) { .setMinSupport(0.3); FPGrowthModel model = fpg.run(transactions); - for (Tuple2 s: model.javaFreqItemsets().collect()) { - System.out.println(Arrays.toString((Object[]) s._1()) + ", " + s._2()); + for (FPGrowth.FreqItemset s: model.freqItemsets().toJavaRDD().collect()) { + System.out.println("[" + Joiner.on(",").join(s.javaItems()) + "], " + s.freq()); } sc.stop(); diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java index e9371de39f284..6c6f9768f015e 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPowerIterationClusteringExample.java @@ -17,7 +17,6 @@ package org.apache.spark.examples.mllib; -import scala.Tuple2; import scala.Tuple3; import com.google.common.collect.Lists; @@ -49,8 +48,8 @@ public static void main(String[] args) { .setMaxIterations(10); PowerIterationClusteringModel model = pic.run(similarities); - for (Tuple2 assignment: model.assignments().toJavaRDD().collect()) { - System.out.println(assignment._1() + " -> " + assignment._2()); + for (PowerIterationClustering.Assignment a: model.assignments().toJavaRDD().collect()) { + System.out.println(a.id() + " -> " + a.cluster()); } sc.stop(); diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala index ae66107d7015b..aaae275ec5524 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala @@ -42,8 +42,8 @@ object FPGrowthExample { .setMinSupport(0.3) val model = fpg.run(transactions) - model.freqItemsets.collect().foreach { case (itemset, freq) => - println(itemset.mkString("[", ",", "]") + ", " + freq) + model.freqItemsets.collect().foreach { itemset => + println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) } sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala index b2373adba1fd4..91c9772744f18 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PowerIterationClusteringExample.scala @@ -44,8 +44,7 @@ import org.apache.spark.{SparkConf, SparkContext} * * Here is a sample run and output: * - * ./bin/run-example mllib.PowerIterationClusteringExample - * -k 3 --n 30 --maxIterations 15 + * ./bin/run-example mllib.PowerIterationClusteringExample -k 3 --n 30 --maxIterations 15 * * Cluster assignments: 1 -> [0,1,2,3,4],2 -> [5,6,7,8,9,10,11,12,13,14], * 0 -> [15,16,17,18,19,20,21,22,23,24,25,26,27,28,29] @@ -103,7 +102,7 @@ object PowerIterationClusteringExample { .setMaxIterations(params.maxIterations) .run(circlesRdd) - val clusters = model.assignments.collect.groupBy(_._2).mapValues(_.map(_._1)) + val clusters = model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id)) val assignments = clusters.toList.sortBy { case (k, v) => v.length} val assignmentsStr = assignments .map { case (k, v) => @@ -153,8 +152,5 @@ object PowerIterationClusteringExample { val expCoeff = -1.0 / 2.0 * math.pow(sigma, 2.0) val ssquares = (p1._1 - p2._1) * (p1._1 - p2._1) + (p1._2 - p2._2) * (p1._2 - p2._2) coeff * math.exp(expCoeff * ssquares) - // math.exp((p1._1 - p2._1) * (p1._1 - p2._1) + (p1._2 - p2._2) * (p1._2 - p2._2)) } - - } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala index 63d03347f4572..180023922a9b0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala @@ -17,9 +17,9 @@ package org.apache.spark.mllib.clustering -import org.apache.spark.api.java.JavaRDD import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Experimental +import org.apache.spark.api.java.JavaRDD import org.apache.spark.graphx._ import org.apache.spark.graphx.impl.GraphImpl import org.apache.spark.mllib.linalg.Vectors @@ -33,12 +33,12 @@ import org.apache.spark.util.random.XORShiftRandom * Model produced by [[PowerIterationClustering]]. 
* * @param k number of clusters - * @param assignments an RDD of (vertexID, clusterID) pairs + * @param assignments an RDD of clustering [[PowerIterationClustering#Assignment]]s */ @Experimental class PowerIterationClusteringModel( val k: Int, - val assignments: RDD[(Long, Int)]) extends Serializable + val assignments: RDD[PowerIterationClustering.Assignment]) extends Serializable /** * :: Experimental :: @@ -133,16 +133,33 @@ class PowerIterationClustering private[clustering] ( */ private def pic(w: Graph[Double, Double]): PowerIterationClusteringModel = { val v = powerIter(w, maxIterations) - val assignments = kMeans(v, k) + val assignments = kMeans(v, k).mapPartitions({ iter => + iter.map { case (id, cluster) => + new Assignment(id, cluster) + } + }, preservesPartitioning = true) new PowerIterationClusteringModel(k, assignments) } } -private[clustering] object PowerIterationClustering extends Logging { +@Experimental +object PowerIterationClustering extends Logging { + + /** + * :: Experimental :: + * Cluster assignment. + * @param id node id + * @param cluster assigned cluster id + */ + @Experimental + class Assignment(val id: Long, val cluster: Int) extends Serializable + /** * Normalizes the affinity matrix (A) by row sums and returns the normalized affinity matrix (W). */ - def normalize(similarities: RDD[(Long, Long, Double)]): Graph[Double, Double] = { + private[clustering] + def normalize(similarities: RDD[(Long, Long, Double)]) + : Graph[Double, Double] = { val edges = similarities.flatMap { case (i, j, s) => if (s < 0.0) { throw new SparkException("Similarity must be nonnegative but found s($i, $j) = $s.") @@ -173,6 +190,7 @@ private[clustering] object PowerIterationClustering extends Logging { * @return a graph with edges representing W and vertices representing a random vector * with unit 1-norm */ + private[clustering] def randomInit(g: Graph[Double, Double]): Graph[Double, Double] = { val r = g.vertices.mapPartitionsWithIndex( (part, iter) => { @@ -194,6 +212,7 @@ private[clustering] object PowerIterationClustering extends Logging { * @param g a graph representing the normalized affinity matrix (W) * @return a graph with edges representing W and vertices representing the degree vector */ + private[clustering] def initDegreeVector(g: Graph[Double, Double]): Graph[Double, Double] = { val sum = g.vertices.values.sum() val v0 = g.vertices.mapValues(_ / sum) @@ -207,6 +226,7 @@ private[clustering] object PowerIterationClustering extends Logging { * @param maxIterations maximum number of iterations * @return a [[VertexRDD]] representing the pseudo-eigenvector */ + private[clustering] def powerIter( g: Graph[Double, Double], maxIterations: Int): VertexRDD[Double] = { @@ -246,6 +266,7 @@ private[clustering] object PowerIterationClustering extends Logging { * @param k number of clusters * @return a [[VertexRDD]] representing the clustering assignments */ + private[clustering] def kMeans(v: VertexRDD[Double], k: Int): VertexRDD[Int] = { val points = v.mapValues(x => Vectors.dense(x)).cache() val model = new KMeans() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index 3168d608c9556..efa8459d3cdba 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -26,8 +26,9 @@ import scala.reflect.ClassTag import org.apache.spark.{HashPartitioner, Logging, Partitioner, SparkException} import 
org.apache.spark.annotation.Experimental -import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} +import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag +import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -35,18 +36,11 @@ import org.apache.spark.storage.StorageLevel * :: Experimental :: * * Model trained by [[FPGrowth]], which holds frequent itemsets. - * @param freqItemsets frequent itemset, which is an RDD of (itemset, frequency) pairs + * @param freqItemsets frequent itemset, which is an RDD of [[FreqItemset]] * @tparam Item item type */ @Experimental -class FPGrowthModel[Item: ClassTag]( - val freqItemsets: RDD[(Array[Item], Long)]) extends Serializable { - - /** Returns frequent itemsets as a [[org.apache.spark.api.java.JavaPairRDD]]. */ - def javaFreqItemsets(): JavaPairRDD[Array[Item], java.lang.Long] = { - JavaPairRDD.fromRDD(freqItemsets).asInstanceOf[JavaPairRDD[Array[Item], java.lang.Long]] - } -} +class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable /** * :: Experimental :: @@ -151,7 +145,7 @@ class FPGrowth private ( data: RDD[Array[Item]], minCount: Long, freqItems: Array[Item], - partitioner: Partitioner): RDD[(Array[Item], Long)] = { + partitioner: Partitioner): RDD[FreqItemset[Item]] = { val itemToRank = freqItems.zipWithIndex.toMap data.flatMap { transaction => genCondTransactions(transaction, itemToRank, partitioner) @@ -161,7 +155,7 @@ class FPGrowth private ( .flatMap { case (part, tree) => tree.extract(minCount, x => partitioner.getPartition(x) == part) }.map { case (ranks, count) => - (ranks.map(i => freqItems(i)).toArray, count) + new FreqItemset(ranks.map(i => freqItems(i)).toArray, count) } } @@ -193,3 +187,26 @@ class FPGrowth private ( output } } + +/** + * :: Experimental :: + */ +@Experimental +object FPGrowth { + + /** + * Frequent itemset. + * @param items items in this itemset. Java users should call [[FreqItemset#javaItems]] instead. + * @param freq frequency + * @tparam Item item type + */ + class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable { + + /** + * Returns items in a Java List. 
+ */ + def javaItems: java.util.List[Item] = { + items.toList.asJava + } + } +} diff --git a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java index 851707c8a19c4..bd0edf2b9ea62 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaFPGrowthSuite.java @@ -19,6 +19,7 @@ import java.io.Serializable; import java.util.ArrayList; +import java.util.List; import org.junit.After; import org.junit.Before; @@ -28,6 +29,7 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset; public class JavaFPGrowthSuite implements Serializable { private transient JavaSparkContext sc; @@ -55,30 +57,18 @@ public void runFPGrowth() { Lists.newArrayList("z".split(" ")), Lists.newArrayList("x z y r q t p".split(" "))), 2); - FPGrowth fpg = new FPGrowth(); - - FPGrowthModel model6 = fpg - .setMinSupport(0.9) - .setNumPartitions(1) - .run(rdd); - assertEquals(0, model6.javaFreqItemsets().count()); - - FPGrowthModel model3 = fpg + FPGrowthModel model = new FPGrowth() .setMinSupport(0.5) .setNumPartitions(2) .run(rdd); - assertEquals(18, model3.javaFreqItemsets().count()); - FPGrowthModel model2 = fpg - .setMinSupport(0.3) - .setNumPartitions(4) - .run(rdd); - assertEquals(54, model2.javaFreqItemsets().count()); + List> freqItemsets = model.freqItemsets().toJavaRDD().collect(); + assertEquals(18, freqItemsets.size()); - FPGrowthModel model1 = fpg - .setMinSupport(0.1) - .setNumPartitions(8) - .run(rdd); - assertEquals(625, model1.javaFreqItemsets().count()); + for (FreqItemset itemset: freqItemsets) { + // Test return types. 
+ List items = itemset.javaItems(); + long freq = itemset.freq(); + } } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala index 03ecd9ca730be..6315c03a700f1 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala @@ -51,8 +51,8 @@ class PowerIterationClusteringSuite extends FunSuite with MLlibTestSparkContext .setK(2) .run(sc.parallelize(similarities, 2)) val predictions = Array.fill(2)(mutable.Set.empty[Long]) - model.assignments.collect().foreach { case (i, c) => - predictions(c) += i + model.assignments.collect().foreach { a => + predictions(a.cluster) += a.id } assert(predictions.toSet == Set((0 to 3).toSet, (4 to 15).toSet)) @@ -61,8 +61,8 @@ class PowerIterationClusteringSuite extends FunSuite with MLlibTestSparkContext .setInitializationMode("degree") .run(sc.parallelize(similarities, 2)) val predictions2 = Array.fill(2)(mutable.Set.empty[Long]) - model2.assignments.collect().foreach { case (i, c) => - predictions2(c) += i + model2.assignments.collect().foreach { a => + predictions2(a.cluster) += a.id } assert(predictions2.toSet == Set((0 to 3).toSet, (4 to 15).toSet)) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala index 68128284b8608..bd5b9cc3afa10 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala @@ -46,8 +46,8 @@ class FPGrowthSuite extends FunSuite with MLlibTestSparkContext { .setMinSupport(0.5) .setNumPartitions(2) .run(rdd) - val freqItemsets3 = model3.freqItemsets.collect().map { case (items, count) => - (items.toSet, count) + val freqItemsets3 = model3.freqItemsets.collect().map { itemset => + (itemset.items.toSet, itemset.freq) } val expected = Set( (Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L), @@ -96,10 +96,10 @@ class FPGrowthSuite extends FunSuite with MLlibTestSparkContext { .setMinSupport(0.5) .setNumPartitions(2) .run(rdd) - assert(model3.freqItemsets.first()._1.getClass === Array(1).getClass, + assert(model3.freqItemsets.first().items.getClass === Array(1).getClass, "frequent itemsets should use primitive arrays") - val freqItemsets3 = model3.freqItemsets.collect().map { case (items, count) => - (items.toSet, count) + val freqItemsets3 = model3.freqItemsets.collect().map { itemset => + (itemset.items.toSet, itemset.freq) } val expected = Set( (Set(1), 6L), (Set(2), 5L), (Set(3), 5L), (Set(4), 4L), From 3be92cdac30cf488e09dbdaaa70e5c4cdaa9a099 Mon Sep 17 00:00:00 2001 From: mcheah Date: Thu, 19 Feb 2015 18:09:22 -0800 Subject: [PATCH 260/357] [SPARK-4808] Removing minimum number of elements read before spill check In the general case, Spillable's heuristic of checking for memory stress on every 32nd item after 1000 items are read is good enough. In general, we do not want to be enacting the spilling checks until later on in the job; checking for disk-spilling too early can produce unacceptable performance impact in trivial cases. However, there are non-trivial cases, particularly if each serialized object is large, where checking for the necessity to spill too late would allow the memory to overflow. 
Consider if every item is 1.5 MB in size, and the heap size is 1000 MB. Then clearly if we only try to spill the in-memory contents to disk after 1000 items are read, we would have already accumulated 1500 MB of RAM and overflowed the heap. Patch #3656 attempted to circumvent this by checking the need to spill on every single item read, but that would cause unacceptable performance in the general case. However, the convoluted cases above should not be forced to be refactored to shrink the data items. Therefore it makes sense that the memory spilling thresholds be configurable. Author: mcheah Closes #4420 from mingyukim/memory-spill-configurable and squashes the following commits: 6e2509f [mcheah] [SPARK-4808] Removing minimum number of elements read before spill check --- .../scala/org/apache/spark/util/collection/Spillable.scala | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala b/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala index 9f54312074856..747ecf075a397 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala @@ -42,9 +42,6 @@ private[spark] trait Spillable[C] extends Logging { // Memory manager that can be used to acquire/release memory private[this] val shuffleMemoryManager = SparkEnv.get.shuffleMemoryManager - // Threshold for `elementsRead` before we start tracking this collection's memory usage - private[this] val trackMemoryThreshold = 1000 - // Initial threshold for the size of a collection before we start tracking its memory usage // Exposed for testing private[this] val initialMemoryThreshold: Long = @@ -72,8 +69,7 @@ private[spark] trait Spillable[C] extends Logging { * @return true if `collection` was spilled to disk; false otherwise */ protected def maybeSpill(collection: C, currentMemory: Long): Boolean = { - if (elementsRead > trackMemoryThreshold && elementsRead % 32 == 0 && - currentMemory >= myMemoryThreshold) { + if (elementsRead % 32 == 0 && currentMemory >= myMemoryThreshold) { // Claim up to double our current memory from the shuffle memory pool val amountToRequest = 2 * currentMemory - myMemoryThreshold val granted = shuffleMemoryManager.tryToAcquire(amountToRequest) From 70bfb5c7282df84e76eba01f59bf1b8551583c33 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Fri, 20 Feb 2015 16:20:02 +0800 Subject: [PATCH 261/357] [SPARK-5909][SQL] Add a clearCache command to Spark SQL's cache manager JIRA: https://issues.apache.org/jira/browse/SPARK-5909 Author: Yin Huai Closes #4694 from yhuai/clearCache and squashes the following commits: 397ecc4 [Yin Huai] Address comments. a2702fc [Yin Huai] Update parser. 3a54506 [Yin Huai] add isEmpty to CacheManager. 6d14460 [Yin Huai] Python clearCache. f7b8dbd [Yin Huai] Add clear cache command. 
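A minimal usage sketch (not part of the diff below; `sqlContext` and the previously registered temp table `t1` are assumed to exist):

{% highlight scala %}
sqlContext.cacheTable("t1")    // put a registered temp table into the in-memory cache
sqlContext.sql("CLEAR CACHE")  // new parser rule introduced by this patch
// Equivalent programmatic form, also added to SQLContext in this patch:
sqlContext.clearCache()
{% endhighlight %}
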
--- python/pyspark/sql/context.py | 4 ++++ .../org/apache/spark/sql/CacheManager.scala | 6 ++++++ .../scala/org/apache/spark/sql/SQLContext.scala | 5 +++++ .../org/apache/spark/sql/SparkSQLParser.scala | 11 +++++++---- .../apache/spark/sql/execution/commands.scala | 15 +++++++++++++++ .../org/apache/spark/sql/CachedTableSuite.scala | 16 ++++++++++++++++ 6 files changed, 53 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 2e2309f10375d..3f168f718bd7f 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -687,6 +687,10 @@ def uncacheTable(self, tableName): """Removes the specified table from the in-memory cache.""" self._ssql_ctx.uncacheTable(tableName) + def clearCache(self): + """Removes all cached tables from the in-memory cache. """ + self._ssql_ctx.clearCache() + class HiveContext(SQLContext): diff --git a/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala index f1949aa5dd74b..ca4a127120b37 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/CacheManager.scala @@ -71,11 +71,17 @@ private[sql] class CacheManager(sqlContext: SQLContext) extends Logging { } } + /** Clears all cached tables. */ private[sql] def clearCache(): Unit = writeLock { cachedData.foreach(_.cachedRepresentation.cachedColumnBuffers.unpersist()) cachedData.clear() } + /** Checks if the cache is empty. */ + private[sql] def isEmpty: Boolean = readLock { + cachedData.isEmpty + } + /** * Caches the data produced by the logical representation of the given schema rdd. Unlike * `RDD.cache()`, the default storage level is set to be `MEMORY_AND_DISK` because recomputing diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index a6cf3cd9ddd4f..4bdaa023914b8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -217,6 +217,11 @@ class SQLContext(@transient val sparkContext: SparkContext) */ def uncacheTable(tableName: String): Unit = cacheManager.uncacheTable(tableName) + /** + * Removes all cached tables from the in-memory cache. 
+ */ + def clearCache(): Unit = cacheManager.clearCache() + // scalastyle:off // Disable style checker so "implicits" object can start with lowercase i /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala index 00e19da4374a8..5921eaf5e63f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSQLParser.scala @@ -23,7 +23,7 @@ import scala.util.parsing.combinator.RegexParsers import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.{ShowTablesCommand, UncacheTableCommand, CacheTableCommand, SetCommand} +import org.apache.spark.sql.execution._ import org.apache.spark.sql.types.StringType @@ -57,6 +57,7 @@ private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends Abstr protected val AS = Keyword("AS") protected val CACHE = Keyword("CACHE") + protected val CLEAR = Keyword("CLEAR") protected val IN = Keyword("IN") protected val LAZY = Keyword("LAZY") protected val SET = Keyword("SET") @@ -74,9 +75,11 @@ private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends Abstr } private lazy val uncache: Parser[LogicalPlan] = - UNCACHE ~ TABLE ~> ident ^^ { - case tableName => UncacheTableCommand(tableName) - } + ( UNCACHE ~ TABLE ~> ident ^^ { + case tableName => UncacheTableCommand(tableName) + } + | CLEAR ~ CACHE ^^^ ClearCacheCommand + ) private lazy val set: Parser[LogicalPlan] = SET ~> restInput ^^ { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 7c92e9fc88168..a11232142d0fb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -174,6 +174,21 @@ case class UncacheTableCommand(tableName: String) extends RunnableCommand { override def output: Seq[Attribute] = Seq.empty } +/** + * :: DeveloperApi :: + * Clear all cached data from the in-memory cache. 
+ */ +@DeveloperApi +case object ClearCacheCommand extends RunnableCommand { + + override def run(sqlContext: SQLContext) = { + sqlContext.clearCache() + Seq.empty[Row] + } + + override def output: Seq[Attribute] = Seq.empty +} + /** * :: DeveloperApi :: */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index e70e866fdbf14..c240f2be955ca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -280,4 +280,20 @@ class CachedTableSuite extends QueryTest { assert(intercept[RuntimeException](table("t1")).getMessage.startsWith("Table Not Found")) assert(!isCached("t2")) } + + test("Clear all cache") { + sql("SELECT key FROM testData LIMIT 10").registerTempTable("t1") + sql("SELECT key FROM testData LIMIT 5").registerTempTable("t2") + cacheTable("t1") + cacheTable("t2") + clearCache() + assert(cacheManager.isEmpty) + + sql("SELECT key FROM testData LIMIT 10").registerTempTable("t1") + sql("SELECT key FROM testData LIMIT 5").registerTempTable("t2") + cacheTable("t1") + cacheTable("t2") + sql("Clear CACHE") + assert(cacheManager.isEmpty) + } } From d3dfebebce9f76e4433e16d4d6d29fb8fa4d4193 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Fri, 20 Feb 2015 10:21:39 +0000 Subject: [PATCH 262/357] SPARK-5744 [CORE] Take 2. RDD.isEmpty / take fails for (empty) RDD of Nothing Follow-on to https://github.com/apache/spark/pull/4591 Document isEmpty / take / parallelize and their interaction with (an empty) RDD[Nothing] and RDD[Null]. Also, fix a marginally related minor issue with histogram() and EmptyRDD. CC rxin since you reviewed the last one although I imagine this is an uncontroversial resolution. Author: Sean Owen Closes #4698 from srowen/SPARK-5744.2 and squashes the following commits: 9b2a811 [Sean Owen] 2 extra javadoc fixes d1b9fba [Sean Owen] Document isEmpty / take / parallelize and their interaction with (an empty) RDD[Nothing] and RDD[Null]. Also, fix a marginally related minor issue with histogram() and EmptyRDD. --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 ++ .../scala/org/apache/spark/rdd/DoubleRDDFunctions.scala | 9 ++++++++- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 7 +++++++ core/src/test/java/org/apache/spark/JavaAPISuite.java | 4 ++++ .../test/scala/org/apache/spark/rdd/DoubleRDDSuite.scala | 3 +++ 5 files changed, 24 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index d59b466830fdc..85ec5ea11357e 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -548,6 +548,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Parallelize acts lazily. If `seq` is a mutable collection and is altered after the call * to parallelize and before the first action on the RDD, the resultant RDD will reflect the * modified collection. Pass a copy of the argument to avoid this. + * @note avoid using `parallelize(Seq())` to create an empty `RDD`. Consider `emptyRDD` for an + * RDD with no partitions, or `parallelize(Seq[T]())` for an RDD of `T` with empty partitions. 
*/ def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = { assertNotStopped() diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala index e66f83bb34e30..03afc289736bb 100644 --- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala @@ -213,7 +213,14 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { } else { basicBucketFunction _ } - self.mapPartitions(histogramPartition(bucketFunction)).reduce(mergeCounters) + if (self.partitions.length == 0) { + new Array[Long](buckets.length - 1) + } else { + // reduce() requires a non-empty RDD. This works because the mapPartitions will make + // non-empty partitions out of empty ones. But it doesn't handle the no-partitions case, + // which is below + self.mapPartitions(histogramPartition(bucketFunction)).reduce(mergeCounters) + } } } diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 3ab9e54f0ec56..cf0433010aa03 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1146,6 +1146,9 @@ abstract class RDD[T: ClassTag]( * Take the first num elements of the RDD. It works by first scanning one partition, and use the * results from that partition to estimate the number of additional partitions needed to satisfy * the limit. + * + * @note due to complications in the internal implementation, this method will raise + * an exception if called on an RDD of `Nothing` or `Null`. */ def take(num: Int): Array[T] = { if (num == 0) { @@ -1258,6 +1261,10 @@ abstract class RDD[T: ClassTag]( def min()(implicit ord: Ordering[T]): T = this.reduce(ord.min) /** + * @note due to complications in the internal implementation, this method will raise an + * exception if called on an RDD of `Nothing` or `Null`. This may be come up in practice + * because, for example, the type of `parallelize(Seq())` is `RDD[Nothing]`. + * (`parallelize(Seq())` should be avoided anyway in favor of `parallelize(Seq[T]())`.) * @return true if and only if the RDD contains no elements at all. Note that an RDD * may be empty even when it has at least 1 partition. 
*/ diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index b16a1e9460286..74e88c767ee07 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -708,6 +708,10 @@ public void javaDoubleRDDHistoGram() { // Test with provided buckets long[] histogram = rdd.histogram(expected_buckets); Assert.assertArrayEquals(expected_counts, histogram); + // SPARK-5744 + Assert.assertArrayEquals( + new long[] {0}, + sc.parallelizeDoubles(new ArrayList(0), 1).histogram(new double[]{0.0, 1.0})); } @Test diff --git a/core/src/test/scala/org/apache/spark/rdd/DoubleRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/DoubleRDDSuite.scala index de306533752c1..4cd0f97368ca3 100644 --- a/core/src/test/scala/org/apache/spark/rdd/DoubleRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/DoubleRDDSuite.scala @@ -33,6 +33,9 @@ class DoubleRDDSuite extends FunSuite with SharedSparkContext { val expectedHistogramResults = Array(0) assert(histogramResults === expectedHistogramResults) assert(histogramResults2 === expectedHistogramResults) + val emptyRDD: RDD[Double] = sc.emptyRDD + assert(emptyRDD.histogram(buckets) === expectedHistogramResults) + assert(emptyRDD.histogram(buckets, true) === expectedHistogramResults) } test("WorksWithOutOfRangeWithOneBucket") { From 4a17eedb16343413e5b6f8bb58c6da8952ee7ab6 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Fri, 20 Feb 2015 02:31:32 -0800 Subject: [PATCH 263/357] [SPARK-5867] [SPARK-5892] [doc] [ml] [mllib] Doc cleanups for 1.3 release For SPARK-5867: * The spark.ml programming guide needs to be updated to use the new SQL DataFrame API instead of the old SchemaRDD API. * It should also include Python examples now. For SPARK-5892: * Fix Python docs * Various other cleanups BTW, I accidentally merged this with master. If you want to compile it on your own, use this branch which is based on spark/branch-1.3 and cherry-picks the commits from this PR: [https://github.com/jkbradley/spark/tree/doc-review-1.3-check] CC: mengxr (ML), davies (Python docs) Author: Joseph K. Bradley Closes #4675 from jkbradley/doc-review-1.3 and squashes the following commits: f191bb0 [Joseph K. Bradley] small cleanups e786efa [Joseph K. Bradley] small doc corrections 6b1ab4a [Joseph K. Bradley] fixed python lint test 946affa [Joseph K. Bradley] Added sample data for ml.MovieLensALS example. Changed spark.ml Java examples to use DataFrames API instead of sql() da81558 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into doc-review-1.3 629dbf5 [Joseph K. Bradley] Updated based on code review: * made new page for old migration guides * small fixes * moved inherit_doc in python b9df7c4 [Joseph K. Bradley] Small cleanups: toDF to toDF(), adding s for string interpolation 34b067f [Joseph K. Bradley] small doc correction da16aef [Joseph K. Bradley] Fixed python mllib docs 8cce91c [Joseph K. Bradley] GMM: removed old imports, added some doc 695f3f6 [Joseph K. Bradley] partly done trying to fix inherit_doc for class hierarchies in python docs a72c018 [Joseph K. Bradley] made ChiSqTestResult appear in python docs b05a80d [Joseph K. Bradley] organize imports. doc cleanups e572827 [Joseph K. 
Bradley] updated programming guide for ml and mllib --- data/mllib/als/sample_movielens_movies.txt | 100 ++ data/mllib/als/sample_movielens_ratings.txt | 1501 +++++++++++++++++ docs/ml-guide.md | 249 +-- docs/mllib-guide.md | 133 +- docs/mllib-migration-guides.md | 67 + .../ml/JavaCrossValidatorExample.java | 7 +- .../examples/ml/JavaSimpleParamsExample.java | 8 +- .../JavaSimpleTextClassificationPipeline.java | 7 +- .../ml/simple_text_classification_pipeline.py | 4 +- .../examples/ml/CrossValidatorExample.scala | 2 +- .../spark/examples/ml/MovieLensALS.scala | 32 +- .../examples/ml/SimpleParamsExample.scala | 6 +- .../ml/SimpleTextClassificationPipeline.scala | 2 +- .../mllib/clustering/GaussianMixture.scala | 12 +- .../apache/spark/mllib/linalg/Matrices.scala | 6 +- .../apache/spark/mllib/linalg/Vectors.scala | 2 +- .../spark/mllib/optimization/Gradient.scala | 14 +- python/docs/pyspark.mllib.rst | 11 +- python/pyspark/ml/classification.py | 3 +- python/pyspark/ml/feature.py | 3 +- python/pyspark/ml/pipeline.py | 3 +- python/pyspark/ml/util.py | 14 - python/pyspark/ml/wrapper.py | 2 +- python/pyspark/mllib/__init__.py | 2 +- python/pyspark/mllib/clustering.py | 2 +- python/pyspark/mllib/common.py | 17 + python/pyspark/mllib/regression.py | 8 +- python/pyspark/mllib/stat/__init__.py | 4 +- python/pyspark/mllib/stat/distribution.py | 3 +- python/pyspark/mllib/tree.py | 6 +- 30 files changed, 1940 insertions(+), 290 deletions(-) create mode 100644 data/mllib/als/sample_movielens_movies.txt create mode 100644 data/mllib/als/sample_movielens_ratings.txt create mode 100644 docs/mllib-migration-guides.md diff --git a/data/mllib/als/sample_movielens_movies.txt b/data/mllib/als/sample_movielens_movies.txt new file mode 100644 index 0000000000000..934a0253849e1 --- /dev/null +++ b/data/mllib/als/sample_movielens_movies.txt @@ -0,0 +1,100 @@ +0::Movie 0::Romance|Comedy +1::Movie 1::Action|Anime +2::Movie 2::Romance|Thriller +3::Movie 3::Action|Romance +4::Movie 4::Anime|Comedy +5::Movie 5::Action|Action +6::Movie 6::Action|Comedy +7::Movie 7::Anime|Comedy +8::Movie 8::Comedy|Action +9::Movie 9::Anime|Thriller +10::Movie 10::Action|Anime +11::Movie 11::Action|Anime +12::Movie 12::Anime|Comedy +13::Movie 13::Thriller|Action +14::Movie 14::Anime|Comedy +15::Movie 15::Comedy|Thriller +16::Movie 16::Anime|Romance +17::Movie 17::Thriller|Action +18::Movie 18::Action|Comedy +19::Movie 19::Anime|Romance +20::Movie 20::Action|Anime +21::Movie 21::Romance|Thriller +22::Movie 22::Romance|Romance +23::Movie 23::Comedy|Comedy +24::Movie 24::Anime|Action +25::Movie 25::Comedy|Comedy +26::Movie 26::Anime|Romance +27::Movie 27::Anime|Anime +28::Movie 28::Thriller|Anime +29::Movie 29::Anime|Romance +30::Movie 30::Thriller|Romance +31::Movie 31::Thriller|Romance +32::Movie 32::Comedy|Anime +33::Movie 33::Comedy|Comedy +34::Movie 34::Anime|Anime +35::Movie 35::Action|Thriller +36::Movie 36::Anime|Romance +37::Movie 37::Romance|Anime +38::Movie 38::Thriller|Romance +39::Movie 39::Romance|Comedy +40::Movie 40::Action|Anime +41::Movie 41::Comedy|Thriller +42::Movie 42::Comedy|Action +43::Movie 43::Thriller|Anime +44::Movie 44::Anime|Action +45::Movie 45::Comedy|Romance +46::Movie 46::Comedy|Action +47::Movie 47::Romance|Comedy +48::Movie 48::Action|Comedy +49::Movie 49::Romance|Romance +50::Movie 50::Comedy|Romance +51::Movie 51::Action|Action +52::Movie 52::Thriller|Action +53::Movie 53::Action|Action +54::Movie 54::Romance|Thriller +55::Movie 55::Anime|Romance +56::Movie 56::Comedy|Action +57::Movie 57::Action|Anime 
+58::Movie 58::Thriller|Romance +59::Movie 59::Thriller|Comedy +60::Movie 60::Anime|Comedy +61::Movie 61::Comedy|Action +62::Movie 62::Comedy|Romance +63::Movie 63::Romance|Thriller +64::Movie 64::Romance|Action +65::Movie 65::Anime|Romance +66::Movie 66::Comedy|Action +67::Movie 67::Thriller|Anime +68::Movie 68::Thriller|Romance +69::Movie 69::Action|Comedy +70::Movie 70::Thriller|Thriller +71::Movie 71::Action|Comedy +72::Movie 72::Thriller|Romance +73::Movie 73::Comedy|Action +74::Movie 74::Action|Action +75::Movie 75::Action|Action +76::Movie 76::Comedy|Comedy +77::Movie 77::Comedy|Comedy +78::Movie 78::Comedy|Comedy +79::Movie 79::Thriller|Thriller +80::Movie 80::Comedy|Anime +81::Movie 81::Comedy|Anime +82::Movie 82::Romance|Anime +83::Movie 83::Comedy|Thriller +84::Movie 84::Anime|Action +85::Movie 85::Thriller|Anime +86::Movie 86::Romance|Anime +87::Movie 87::Thriller|Thriller +88::Movie 88::Romance|Thriller +89::Movie 89::Action|Anime +90::Movie 90::Anime|Romance +91::Movie 91::Anime|Thriller +92::Movie 92::Action|Comedy +93::Movie 93::Romance|Thriller +94::Movie 94::Thriller|Comedy +95::Movie 95::Action|Action +96::Movie 96::Thriller|Romance +97::Movie 97::Thriller|Thriller +98::Movie 98::Thriller|Comedy +99::Movie 99::Thriller|Romance diff --git a/data/mllib/als/sample_movielens_ratings.txt b/data/mllib/als/sample_movielens_ratings.txt new file mode 100644 index 0000000000000..0889142950797 --- /dev/null +++ b/data/mllib/als/sample_movielens_ratings.txt @@ -0,0 +1,1501 @@ +0::2::3::1424380312 +0::3::1::1424380312 +0::5::2::1424380312 +0::9::4::1424380312 +0::11::1::1424380312 +0::12::2::1424380312 +0::15::1::1424380312 +0::17::1::1424380312 +0::19::1::1424380312 +0::21::1::1424380312 +0::23::1::1424380312 +0::26::3::1424380312 +0::27::1::1424380312 +0::28::1::1424380312 +0::29::1::1424380312 +0::30::1::1424380312 +0::31::1::1424380312 +0::34::1::1424380312 +0::37::1::1424380312 +0::41::2::1424380312 +0::44::1::1424380312 +0::45::2::1424380312 +0::46::1::1424380312 +0::47::1::1424380312 +0::48::1::1424380312 +0::50::1::1424380312 +0::51::1::1424380312 +0::54::1::1424380312 +0::55::1::1424380312 +0::59::2::1424380312 +0::61::2::1424380312 +0::64::1::1424380312 +0::67::1::1424380312 +0::68::1::1424380312 +0::69::1::1424380312 +0::71::1::1424380312 +0::72::1::1424380312 +0::77::2::1424380312 +0::79::1::1424380312 +0::83::1::1424380312 +0::87::1::1424380312 +0::89::2::1424380312 +0::91::3::1424380312 +0::92::4::1424380312 +0::94::1::1424380312 +0::95::2::1424380312 +0::96::1::1424380312 +0::98::1::1424380312 +0::99::1::1424380312 +1::2::2::1424380312 +1::3::1::1424380312 +1::4::2::1424380312 +1::6::1::1424380312 +1::9::3::1424380312 +1::12::1::1424380312 +1::13::1::1424380312 +1::14::1::1424380312 +1::16::1::1424380312 +1::19::1::1424380312 +1::21::3::1424380312 +1::27::1::1424380312 +1::28::3::1424380312 +1::33::1::1424380312 +1::36::2::1424380312 +1::37::1::1424380312 +1::40::1::1424380312 +1::41::2::1424380312 +1::43::1::1424380312 +1::44::1::1424380312 +1::47::1::1424380312 +1::50::1::1424380312 +1::54::1::1424380312 +1::56::2::1424380312 +1::57::1::1424380312 +1::58::1::1424380312 +1::60::1::1424380312 +1::62::4::1424380312 +1::63::1::1424380312 +1::67::1::1424380312 +1::68::4::1424380312 +1::70::2::1424380312 +1::72::1::1424380312 +1::73::1::1424380312 +1::74::2::1424380312 +1::76::1::1424380312 +1::77::3::1424380312 +1::78::1::1424380312 +1::81::1::1424380312 +1::82::1::1424380312 +1::85::3::1424380312 +1::86::2::1424380312 +1::88::2::1424380312 +1::91::1::1424380312 
+1::92::2::1424380312 +1::93::1::1424380312 +1::94::2::1424380312 +1::96::1::1424380312 +1::97::1::1424380312 +2::4::3::1424380312 +2::6::1::1424380312 +2::8::5::1424380312 +2::9::1::1424380312 +2::10::1::1424380312 +2::12::3::1424380312 +2::13::1::1424380312 +2::15::2::1424380312 +2::18::2::1424380312 +2::19::4::1424380312 +2::22::1::1424380312 +2::26::1::1424380312 +2::28::1::1424380312 +2::34::4::1424380312 +2::35::1::1424380312 +2::37::5::1424380312 +2::38::1::1424380312 +2::39::5::1424380312 +2::40::4::1424380312 +2::47::1::1424380312 +2::50::1::1424380312 +2::52::2::1424380312 +2::54::1::1424380312 +2::55::1::1424380312 +2::57::2::1424380312 +2::58::2::1424380312 +2::59::1::1424380312 +2::61::1::1424380312 +2::62::1::1424380312 +2::64::1::1424380312 +2::65::1::1424380312 +2::66::3::1424380312 +2::68::1::1424380312 +2::71::3::1424380312 +2::76::1::1424380312 +2::77::1::1424380312 +2::78::1::1424380312 +2::80::1::1424380312 +2::83::5::1424380312 +2::85::1::1424380312 +2::87::2::1424380312 +2::88::1::1424380312 +2::89::4::1424380312 +2::90::1::1424380312 +2::92::4::1424380312 +2::93::5::1424380312 +3::0::1::1424380312 +3::1::1::1424380312 +3::2::1::1424380312 +3::7::3::1424380312 +3::8::3::1424380312 +3::9::1::1424380312 +3::14::1::1424380312 +3::15::1::1424380312 +3::16::1::1424380312 +3::18::4::1424380312 +3::19::1::1424380312 +3::24::3::1424380312 +3::26::1::1424380312 +3::29::3::1424380312 +3::33::1::1424380312 +3::34::3::1424380312 +3::35::1::1424380312 +3::36::3::1424380312 +3::37::1::1424380312 +3::38::2::1424380312 +3::43::1::1424380312 +3::44::1::1424380312 +3::46::1::1424380312 +3::47::1::1424380312 +3::51::5::1424380312 +3::52::3::1424380312 +3::56::1::1424380312 +3::58::1::1424380312 +3::60::3::1424380312 +3::62::1::1424380312 +3::65::2::1424380312 +3::66::1::1424380312 +3::67::1::1424380312 +3::68::2::1424380312 +3::70::1::1424380312 +3::72::2::1424380312 +3::76::3::1424380312 +3::79::3::1424380312 +3::80::4::1424380312 +3::81::1::1424380312 +3::83::1::1424380312 +3::84::1::1424380312 +3::86::1::1424380312 +3::87::2::1424380312 +3::88::4::1424380312 +3::89::1::1424380312 +3::91::1::1424380312 +3::94::3::1424380312 +4::1::1::1424380312 +4::6::1::1424380312 +4::8::1::1424380312 +4::9::1::1424380312 +4::10::1::1424380312 +4::11::1::1424380312 +4::12::1::1424380312 +4::13::1::1424380312 +4::14::2::1424380312 +4::15::1::1424380312 +4::17::1::1424380312 +4::20::1::1424380312 +4::22::1::1424380312 +4::23::1::1424380312 +4::24::1::1424380312 +4::29::4::1424380312 +4::30::1::1424380312 +4::31::1::1424380312 +4::34::1::1424380312 +4::35::1::1424380312 +4::36::1::1424380312 +4::39::2::1424380312 +4::40::3::1424380312 +4::41::4::1424380312 +4::43::2::1424380312 +4::44::1::1424380312 +4::45::1::1424380312 +4::46::1::1424380312 +4::47::1::1424380312 +4::49::2::1424380312 +4::50::1::1424380312 +4::51::1::1424380312 +4::52::4::1424380312 +4::54::1::1424380312 +4::55::1::1424380312 +4::60::3::1424380312 +4::61::1::1424380312 +4::62::4::1424380312 +4::63::3::1424380312 +4::65::1::1424380312 +4::67::2::1424380312 +4::69::1::1424380312 +4::70::4::1424380312 +4::71::1::1424380312 +4::73::1::1424380312 +4::78::1::1424380312 +4::84::1::1424380312 +4::85::1::1424380312 +4::87::3::1424380312 +4::88::3::1424380312 +4::89::2::1424380312 +4::96::1::1424380312 +4::97::1::1424380312 +4::98::1::1424380312 +4::99::1::1424380312 +5::0::1::1424380312 +5::1::1::1424380312 +5::4::1::1424380312 +5::5::1::1424380312 +5::8::1::1424380312 +5::9::3::1424380312 +5::10::2::1424380312 +5::13::3::1424380312 
+5::15::1::1424380312 +5::19::1::1424380312 +5::20::3::1424380312 +5::21::2::1424380312 +5::23::3::1424380312 +5::27::1::1424380312 +5::28::1::1424380312 +5::29::1::1424380312 +5::31::1::1424380312 +5::36::3::1424380312 +5::38::2::1424380312 +5::39::1::1424380312 +5::42::1::1424380312 +5::48::3::1424380312 +5::49::4::1424380312 +5::50::3::1424380312 +5::51::1::1424380312 +5::52::1::1424380312 +5::54::1::1424380312 +5::55::5::1424380312 +5::56::3::1424380312 +5::58::1::1424380312 +5::60::1::1424380312 +5::61::1::1424380312 +5::64::3::1424380312 +5::65::2::1424380312 +5::68::4::1424380312 +5::70::1::1424380312 +5::71::1::1424380312 +5::72::1::1424380312 +5::74::1::1424380312 +5::79::1::1424380312 +5::81::2::1424380312 +5::84::1::1424380312 +5::85::1::1424380312 +5::86::1::1424380312 +5::88::1::1424380312 +5::90::4::1424380312 +5::91::2::1424380312 +5::95::2::1424380312 +5::99::1::1424380312 +6::0::1::1424380312 +6::1::1::1424380312 +6::2::3::1424380312 +6::5::1::1424380312 +6::6::1::1424380312 +6::9::1::1424380312 +6::10::1::1424380312 +6::15::2::1424380312 +6::16::2::1424380312 +6::17::1::1424380312 +6::18::1::1424380312 +6::20::1::1424380312 +6::21::1::1424380312 +6::22::1::1424380312 +6::24::1::1424380312 +6::25::5::1424380312 +6::26::1::1424380312 +6::28::1::1424380312 +6::30::1::1424380312 +6::33::1::1424380312 +6::38::1::1424380312 +6::39::1::1424380312 +6::43::4::1424380312 +6::44::1::1424380312 +6::45::1::1424380312 +6::48::1::1424380312 +6::49::1::1424380312 +6::50::1::1424380312 +6::53::1::1424380312 +6::54::1::1424380312 +6::55::1::1424380312 +6::56::1::1424380312 +6::58::4::1424380312 +6::59::1::1424380312 +6::60::1::1424380312 +6::61::3::1424380312 +6::63::3::1424380312 +6::66::1::1424380312 +6::67::3::1424380312 +6::68::1::1424380312 +6::69::1::1424380312 +6::71::2::1424380312 +6::73::1::1424380312 +6::75::1::1424380312 +6::77::1::1424380312 +6::79::1::1424380312 +6::81::1::1424380312 +6::84::1::1424380312 +6::85::3::1424380312 +6::86::1::1424380312 +6::87::1::1424380312 +6::88::1::1424380312 +6::89::1::1424380312 +6::91::2::1424380312 +6::94::1::1424380312 +6::95::2::1424380312 +6::96::1::1424380312 +7::1::1::1424380312 +7::2::2::1424380312 +7::3::1::1424380312 +7::4::1::1424380312 +7::7::1::1424380312 +7::10::1::1424380312 +7::11::2::1424380312 +7::14::2::1424380312 +7::15::1::1424380312 +7::16::1::1424380312 +7::18::1::1424380312 +7::21::1::1424380312 +7::22::1::1424380312 +7::23::1::1424380312 +7::25::5::1424380312 +7::26::1::1424380312 +7::29::4::1424380312 +7::30::1::1424380312 +7::31::3::1424380312 +7::32::1::1424380312 +7::33::1::1424380312 +7::35::1::1424380312 +7::37::2::1424380312 +7::39::3::1424380312 +7::40::2::1424380312 +7::42::2::1424380312 +7::44::1::1424380312 +7::45::2::1424380312 +7::47::4::1424380312 +7::48::1::1424380312 +7::49::1::1424380312 +7::53::1::1424380312 +7::54::1::1424380312 +7::55::1::1424380312 +7::56::1::1424380312 +7::59::1::1424380312 +7::61::2::1424380312 +7::62::3::1424380312 +7::63::2::1424380312 +7::66::1::1424380312 +7::67::3::1424380312 +7::74::1::1424380312 +7::75::1::1424380312 +7::76::3::1424380312 +7::77::1::1424380312 +7::81::1::1424380312 +7::82::1::1424380312 +7::84::2::1424380312 +7::85::4::1424380312 +7::86::1::1424380312 +7::92::2::1424380312 +7::96::1::1424380312 +7::97::1::1424380312 +7::98::1::1424380312 +8::0::1::1424380312 +8::2::4::1424380312 +8::3::2::1424380312 +8::4::2::1424380312 +8::5::1::1424380312 +8::7::1::1424380312 +8::9::1::1424380312 +8::11::1::1424380312 +8::15::1::1424380312 +8::18::1::1424380312 
+8::19::1::1424380312 +8::21::1::1424380312 +8::29::5::1424380312 +8::31::3::1424380312 +8::33::1::1424380312 +8::35::1::1424380312 +8::36::1::1424380312 +8::40::2::1424380312 +8::44::1::1424380312 +8::45::1::1424380312 +8::50::1::1424380312 +8::51::1::1424380312 +8::52::5::1424380312 +8::53::5::1424380312 +8::54::1::1424380312 +8::55::1::1424380312 +8::56::1::1424380312 +8::58::4::1424380312 +8::60::3::1424380312 +8::62::4::1424380312 +8::64::1::1424380312 +8::67::3::1424380312 +8::69::1::1424380312 +8::71::1::1424380312 +8::72::3::1424380312 +8::77::3::1424380312 +8::78::1::1424380312 +8::79::1::1424380312 +8::83::1::1424380312 +8::85::5::1424380312 +8::86::1::1424380312 +8::88::1::1424380312 +8::90::1::1424380312 +8::92::2::1424380312 +8::95::4::1424380312 +8::96::3::1424380312 +8::97::1::1424380312 +8::98::1::1424380312 +8::99::1::1424380312 +9::2::3::1424380312 +9::3::1::1424380312 +9::4::1::1424380312 +9::5::1::1424380312 +9::6::1::1424380312 +9::7::5::1424380312 +9::9::1::1424380312 +9::12::1::1424380312 +9::14::3::1424380312 +9::15::1::1424380312 +9::19::1::1424380312 +9::21::1::1424380312 +9::22::1::1424380312 +9::24::1::1424380312 +9::25::1::1424380312 +9::26::1::1424380312 +9::30::3::1424380312 +9::32::4::1424380312 +9::35::2::1424380312 +9::36::2::1424380312 +9::37::2::1424380312 +9::38::1::1424380312 +9::39::1::1424380312 +9::43::3::1424380312 +9::49::5::1424380312 +9::50::3::1424380312 +9::53::1::1424380312 +9::54::1::1424380312 +9::58::1::1424380312 +9::59::1::1424380312 +9::60::1::1424380312 +9::61::1::1424380312 +9::63::3::1424380312 +9::64::3::1424380312 +9::68::1::1424380312 +9::69::1::1424380312 +9::70::3::1424380312 +9::71::1::1424380312 +9::73::2::1424380312 +9::75::1::1424380312 +9::77::2::1424380312 +9::81::2::1424380312 +9::82::1::1424380312 +9::83::1::1424380312 +9::84::1::1424380312 +9::86::1::1424380312 +9::87::4::1424380312 +9::88::1::1424380312 +9::90::3::1424380312 +9::94::2::1424380312 +9::95::3::1424380312 +9::97::2::1424380312 +9::98::1::1424380312 +10::0::3::1424380312 +10::2::4::1424380312 +10::4::3::1424380312 +10::7::1::1424380312 +10::8::1::1424380312 +10::10::1::1424380312 +10::13::2::1424380312 +10::14::1::1424380312 +10::16::2::1424380312 +10::17::1::1424380312 +10::18::1::1424380312 +10::21::1::1424380312 +10::22::1::1424380312 +10::24::1::1424380312 +10::25::3::1424380312 +10::28::1::1424380312 +10::35::1::1424380312 +10::36::1::1424380312 +10::37::1::1424380312 +10::38::1::1424380312 +10::39::1::1424380312 +10::40::4::1424380312 +10::41::2::1424380312 +10::42::3::1424380312 +10::43::1::1424380312 +10::49::3::1424380312 +10::50::1::1424380312 +10::51::1::1424380312 +10::52::1::1424380312 +10::55::2::1424380312 +10::56::1::1424380312 +10::58::1::1424380312 +10::63::1::1424380312 +10::66::1::1424380312 +10::67::2::1424380312 +10::68::1::1424380312 +10::75::1::1424380312 +10::77::1::1424380312 +10::79::1::1424380312 +10::86::1::1424380312 +10::89::3::1424380312 +10::90::1::1424380312 +10::97::1::1424380312 +10::98::1::1424380312 +11::0::1::1424380312 +11::6::2::1424380312 +11::9::1::1424380312 +11::10::1::1424380312 +11::11::1::1424380312 +11::12::1::1424380312 +11::13::4::1424380312 +11::16::1::1424380312 +11::18::5::1424380312 +11::19::4::1424380312 +11::20::1::1424380312 +11::21::1::1424380312 +11::22::1::1424380312 +11::23::5::1424380312 +11::25::1::1424380312 +11::27::5::1424380312 +11::30::5::1424380312 +11::32::5::1424380312 +11::35::3::1424380312 +11::36::2::1424380312 +11::37::2::1424380312 +11::38::4::1424380312 +11::39::1::1424380312 
+11::40::1::1424380312 +11::41::1::1424380312 +11::43::2::1424380312 +11::45::1::1424380312 +11::47::1::1424380312 +11::48::5::1424380312 +11::50::4::1424380312 +11::51::3::1424380312 +11::59::1::1424380312 +11::61::1::1424380312 +11::62::1::1424380312 +11::64::1::1424380312 +11::66::4::1424380312 +11::67::1::1424380312 +11::69::5::1424380312 +11::70::1::1424380312 +11::71::3::1424380312 +11::72::3::1424380312 +11::75::3::1424380312 +11::76::1::1424380312 +11::77::1::1424380312 +11::78::1::1424380312 +11::79::5::1424380312 +11::80::3::1424380312 +11::81::4::1424380312 +11::82::1::1424380312 +11::86::1::1424380312 +11::88::1::1424380312 +11::89::1::1424380312 +11::90::4::1424380312 +11::94::2::1424380312 +11::97::3::1424380312 +11::99::1::1424380312 +12::2::1::1424380312 +12::4::1::1424380312 +12::6::1::1424380312 +12::7::3::1424380312 +12::8::1::1424380312 +12::14::1::1424380312 +12::15::2::1424380312 +12::16::4::1424380312 +12::17::5::1424380312 +12::18::2::1424380312 +12::21::1::1424380312 +12::22::2::1424380312 +12::23::3::1424380312 +12::24::1::1424380312 +12::25::1::1424380312 +12::27::5::1424380312 +12::30::2::1424380312 +12::31::4::1424380312 +12::35::5::1424380312 +12::38::1::1424380312 +12::41::1::1424380312 +12::44::2::1424380312 +12::45::1::1424380312 +12::50::4::1424380312 +12::51::1::1424380312 +12::52::1::1424380312 +12::53::1::1424380312 +12::54::1::1424380312 +12::56::2::1424380312 +12::57::1::1424380312 +12::60::1::1424380312 +12::63::1::1424380312 +12::64::5::1424380312 +12::66::3::1424380312 +12::67::1::1424380312 +12::70::1::1424380312 +12::72::1::1424380312 +12::74::1::1424380312 +12::75::1::1424380312 +12::77::1::1424380312 +12::78::1::1424380312 +12::79::3::1424380312 +12::82::2::1424380312 +12::83::1::1424380312 +12::84::1::1424380312 +12::85::1::1424380312 +12::86::1::1424380312 +12::87::1::1424380312 +12::88::1::1424380312 +12::91::3::1424380312 +12::92::1::1424380312 +12::94::4::1424380312 +12::95::2::1424380312 +12::96::1::1424380312 +12::98::2::1424380312 +13::0::1::1424380312 +13::3::1::1424380312 +13::4::2::1424380312 +13::5::1::1424380312 +13::6::1::1424380312 +13::12::1::1424380312 +13::14::2::1424380312 +13::15::1::1424380312 +13::17::1::1424380312 +13::18::3::1424380312 +13::20::1::1424380312 +13::21::1::1424380312 +13::22::1::1424380312 +13::26::1::1424380312 +13::27::1::1424380312 +13::29::3::1424380312 +13::31::1::1424380312 +13::33::1::1424380312 +13::40::2::1424380312 +13::43::2::1424380312 +13::44::1::1424380312 +13::45::1::1424380312 +13::49::1::1424380312 +13::51::1::1424380312 +13::52::2::1424380312 +13::53::3::1424380312 +13::54::1::1424380312 +13::62::1::1424380312 +13::63::2::1424380312 +13::64::1::1424380312 +13::68::1::1424380312 +13::71::1::1424380312 +13::72::3::1424380312 +13::73::1::1424380312 +13::74::3::1424380312 +13::77::2::1424380312 +13::78::1::1424380312 +13::79::2::1424380312 +13::83::3::1424380312 +13::85::1::1424380312 +13::86::1::1424380312 +13::87::2::1424380312 +13::88::2::1424380312 +13::90::1::1424380312 +13::93::4::1424380312 +13::94::1::1424380312 +13::98::1::1424380312 +13::99::1::1424380312 +14::1::1::1424380312 +14::3::3::1424380312 +14::4::1::1424380312 +14::5::1::1424380312 +14::6::1::1424380312 +14::7::1::1424380312 +14::9::1::1424380312 +14::10::1::1424380312 +14::11::1::1424380312 +14::12::1::1424380312 +14::13::1::1424380312 +14::14::3::1424380312 +14::15::1::1424380312 +14::16::1::1424380312 +14::17::1::1424380312 +14::20::1::1424380312 +14::21::1::1424380312 +14::24::1::1424380312 +14::25::2::1424380312 
+14::27::1::1424380312 +14::28::1::1424380312 +14::29::5::1424380312 +14::31::3::1424380312 +14::34::1::1424380312 +14::36::1::1424380312 +14::37::2::1424380312 +14::39::2::1424380312 +14::40::1::1424380312 +14::44::1::1424380312 +14::45::1::1424380312 +14::47::3::1424380312 +14::48::1::1424380312 +14::49::1::1424380312 +14::51::1::1424380312 +14::52::5::1424380312 +14::53::3::1424380312 +14::54::1::1424380312 +14::55::1::1424380312 +14::56::1::1424380312 +14::62::4::1424380312 +14::63::5::1424380312 +14::67::3::1424380312 +14::68::1::1424380312 +14::69::3::1424380312 +14::71::1::1424380312 +14::72::4::1424380312 +14::73::1::1424380312 +14::76::5::1424380312 +14::79::1::1424380312 +14::82::1::1424380312 +14::83::1::1424380312 +14::88::1::1424380312 +14::93::3::1424380312 +14::94::1::1424380312 +14::95::2::1424380312 +14::96::4::1424380312 +14::98::1::1424380312 +15::0::1::1424380312 +15::1::4::1424380312 +15::2::1::1424380312 +15::5::2::1424380312 +15::6::1::1424380312 +15::7::1::1424380312 +15::13::1::1424380312 +15::14::1::1424380312 +15::15::1::1424380312 +15::17::2::1424380312 +15::19::2::1424380312 +15::22::2::1424380312 +15::23::2::1424380312 +15::25::1::1424380312 +15::26::3::1424380312 +15::27::1::1424380312 +15::28::2::1424380312 +15::29::1::1424380312 +15::32::1::1424380312 +15::33::2::1424380312 +15::34::1::1424380312 +15::35::2::1424380312 +15::36::1::1424380312 +15::37::1::1424380312 +15::39::1::1424380312 +15::42::1::1424380312 +15::46::5::1424380312 +15::48::2::1424380312 +15::50::2::1424380312 +15::51::1::1424380312 +15::52::1::1424380312 +15::58::1::1424380312 +15::62::1::1424380312 +15::64::3::1424380312 +15::65::2::1424380312 +15::72::1::1424380312 +15::73::1::1424380312 +15::74::1::1424380312 +15::79::1::1424380312 +15::80::1::1424380312 +15::81::1::1424380312 +15::82::2::1424380312 +15::85::1::1424380312 +15::87::1::1424380312 +15::91::2::1424380312 +15::96::1::1424380312 +15::97::1::1424380312 +15::98::3::1424380312 +16::2::1::1424380312 +16::5::3::1424380312 +16::6::2::1424380312 +16::7::1::1424380312 +16::9::1::1424380312 +16::12::1::1424380312 +16::14::1::1424380312 +16::15::1::1424380312 +16::19::1::1424380312 +16::21::2::1424380312 +16::29::4::1424380312 +16::30::2::1424380312 +16::32::1::1424380312 +16::34::1::1424380312 +16::36::1::1424380312 +16::38::1::1424380312 +16::46::1::1424380312 +16::47::3::1424380312 +16::48::1::1424380312 +16::49::1::1424380312 +16::50::1::1424380312 +16::51::5::1424380312 +16::54::5::1424380312 +16::55::1::1424380312 +16::56::2::1424380312 +16::57::1::1424380312 +16::60::1::1424380312 +16::63::2::1424380312 +16::65::1::1424380312 +16::67::1::1424380312 +16::72::1::1424380312 +16::74::1::1424380312 +16::80::1::1424380312 +16::81::1::1424380312 +16::82::1::1424380312 +16::85::5::1424380312 +16::86::1::1424380312 +16::90::5::1424380312 +16::91::1::1424380312 +16::93::1::1424380312 +16::94::3::1424380312 +16::95::2::1424380312 +16::96::3::1424380312 +16::98::3::1424380312 +16::99::1::1424380312 +17::2::1::1424380312 +17::3::1::1424380312 +17::6::1::1424380312 +17::10::4::1424380312 +17::11::1::1424380312 +17::13::2::1424380312 +17::17::5::1424380312 +17::19::1::1424380312 +17::20::5::1424380312 +17::22::4::1424380312 +17::28::1::1424380312 +17::29::1::1424380312 +17::33::1::1424380312 +17::34::1::1424380312 +17::35::2::1424380312 +17::37::1::1424380312 +17::38::1::1424380312 +17::45::1::1424380312 +17::46::5::1424380312 +17::47::1::1424380312 +17::49::3::1424380312 +17::51::1::1424380312 +17::55::5::1424380312 +17::56::3::1424380312 
+17::57::1::1424380312 +17::58::1::1424380312 +17::59::1::1424380312 +17::60::1::1424380312 +17::63::1::1424380312 +17::66::1::1424380312 +17::68::4::1424380312 +17::69::1::1424380312 +17::70::1::1424380312 +17::72::1::1424380312 +17::73::3::1424380312 +17::78::1::1424380312 +17::79::1::1424380312 +17::82::2::1424380312 +17::84::1::1424380312 +17::90::5::1424380312 +17::91::3::1424380312 +17::92::1::1424380312 +17::93::1::1424380312 +17::94::4::1424380312 +17::95::2::1424380312 +17::97::1::1424380312 +18::1::1::1424380312 +18::4::3::1424380312 +18::5::2::1424380312 +18::6::1::1424380312 +18::7::1::1424380312 +18::10::1::1424380312 +18::11::4::1424380312 +18::12::2::1424380312 +18::13::1::1424380312 +18::15::1::1424380312 +18::18::1::1424380312 +18::20::1::1424380312 +18::21::2::1424380312 +18::22::1::1424380312 +18::23::2::1424380312 +18::25::1::1424380312 +18::26::1::1424380312 +18::27::1::1424380312 +18::28::5::1424380312 +18::29::1::1424380312 +18::31::1::1424380312 +18::32::1::1424380312 +18::36::1::1424380312 +18::38::5::1424380312 +18::39::5::1424380312 +18::40::1::1424380312 +18::42::1::1424380312 +18::43::1::1424380312 +18::44::4::1424380312 +18::46::1::1424380312 +18::47::1::1424380312 +18::48::1::1424380312 +18::51::2::1424380312 +18::55::1::1424380312 +18::56::1::1424380312 +18::57::1::1424380312 +18::62::1::1424380312 +18::63::1::1424380312 +18::66::3::1424380312 +18::67::1::1424380312 +18::70::1::1424380312 +18::75::1::1424380312 +18::76::3::1424380312 +18::77::1::1424380312 +18::80::3::1424380312 +18::81::3::1424380312 +18::82::1::1424380312 +18::83::5::1424380312 +18::84::1::1424380312 +18::97::1::1424380312 +18::98::1::1424380312 +18::99::2::1424380312 +19::0::1::1424380312 +19::1::1::1424380312 +19::2::1::1424380312 +19::4::1::1424380312 +19::6::2::1424380312 +19::11::1::1424380312 +19::12::1::1424380312 +19::14::1::1424380312 +19::23::1::1424380312 +19::26::1::1424380312 +19::31::1::1424380312 +19::32::4::1424380312 +19::33::1::1424380312 +19::34::1::1424380312 +19::37::1::1424380312 +19::38::1::1424380312 +19::41::1::1424380312 +19::43::1::1424380312 +19::45::1::1424380312 +19::48::1::1424380312 +19::49::1::1424380312 +19::50::2::1424380312 +19::53::2::1424380312 +19::54::3::1424380312 +19::55::1::1424380312 +19::56::2::1424380312 +19::58::1::1424380312 +19::61::1::1424380312 +19::62::1::1424380312 +19::63::1::1424380312 +19::64::1::1424380312 +19::65::1::1424380312 +19::69::2::1424380312 +19::72::1::1424380312 +19::74::3::1424380312 +19::76::1::1424380312 +19::78::1::1424380312 +19::79::1::1424380312 +19::81::1::1424380312 +19::82::1::1424380312 +19::84::1::1424380312 +19::86::1::1424380312 +19::87::2::1424380312 +19::90::4::1424380312 +19::93::1::1424380312 +19::94::4::1424380312 +19::95::2::1424380312 +19::96::1::1424380312 +19::98::4::1424380312 +20::0::1::1424380312 +20::1::1::1424380312 +20::2::2::1424380312 +20::4::2::1424380312 +20::6::1::1424380312 +20::8::1::1424380312 +20::12::1::1424380312 +20::21::2::1424380312 +20::22::5::1424380312 +20::24::2::1424380312 +20::25::1::1424380312 +20::26::1::1424380312 +20::29::2::1424380312 +20::30::2::1424380312 +20::32::2::1424380312 +20::39::1::1424380312 +20::40::1::1424380312 +20::41::2::1424380312 +20::45::2::1424380312 +20::48::1::1424380312 +20::50::1::1424380312 +20::51::3::1424380312 +20::53::3::1424380312 +20::55::1::1424380312 +20::57::2::1424380312 +20::60::1::1424380312 +20::61::1::1424380312 +20::64::1::1424380312 +20::66::1::1424380312 +20::70::2::1424380312 +20::72::1::1424380312 +20::73::2::1424380312 
+20::75::4::1424380312 +20::76::1::1424380312 +20::77::4::1424380312 +20::78::1::1424380312 +20::79::1::1424380312 +20::84::2::1424380312 +20::85::2::1424380312 +20::88::3::1424380312 +20::89::1::1424380312 +20::90::3::1424380312 +20::91::1::1424380312 +20::92::2::1424380312 +20::93::1::1424380312 +20::94::4::1424380312 +20::97::1::1424380312 +21::0::1::1424380312 +21::2::4::1424380312 +21::3::1::1424380312 +21::7::2::1424380312 +21::11::1::1424380312 +21::12::1::1424380312 +21::13::1::1424380312 +21::14::3::1424380312 +21::17::1::1424380312 +21::19::1::1424380312 +21::20::1::1424380312 +21::21::1::1424380312 +21::22::1::1424380312 +21::23::1::1424380312 +21::24::1::1424380312 +21::27::1::1424380312 +21::29::5::1424380312 +21::30::2::1424380312 +21::38::1::1424380312 +21::40::2::1424380312 +21::43::3::1424380312 +21::44::1::1424380312 +21::45::1::1424380312 +21::46::1::1424380312 +21::48::1::1424380312 +21::51::1::1424380312 +21::53::5::1424380312 +21::54::1::1424380312 +21::55::1::1424380312 +21::56::1::1424380312 +21::58::3::1424380312 +21::59::3::1424380312 +21::64::1::1424380312 +21::66::1::1424380312 +21::68::1::1424380312 +21::71::1::1424380312 +21::73::1::1424380312 +21::74::4::1424380312 +21::80::1::1424380312 +21::81::1::1424380312 +21::83::1::1424380312 +21::84::1::1424380312 +21::85::3::1424380312 +21::87::4::1424380312 +21::89::2::1424380312 +21::92::2::1424380312 +21::96::3::1424380312 +21::99::1::1424380312 +22::0::1::1424380312 +22::3::2::1424380312 +22::5::2::1424380312 +22::6::2::1424380312 +22::9::1::1424380312 +22::10::1::1424380312 +22::11::1::1424380312 +22::13::1::1424380312 +22::14::1::1424380312 +22::16::1::1424380312 +22::18::3::1424380312 +22::19::1::1424380312 +22::22::5::1424380312 +22::25::1::1424380312 +22::26::1::1424380312 +22::29::3::1424380312 +22::30::5::1424380312 +22::32::4::1424380312 +22::33::1::1424380312 +22::35::1::1424380312 +22::36::3::1424380312 +22::37::1::1424380312 +22::40::1::1424380312 +22::41::3::1424380312 +22::44::1::1424380312 +22::45::2::1424380312 +22::48::1::1424380312 +22::51::5::1424380312 +22::55::1::1424380312 +22::56::2::1424380312 +22::60::3::1424380312 +22::61::1::1424380312 +22::62::4::1424380312 +22::63::1::1424380312 +22::65::1::1424380312 +22::66::1::1424380312 +22::68::4::1424380312 +22::69::4::1424380312 +22::70::3::1424380312 +22::71::1::1424380312 +22::74::5::1424380312 +22::75::5::1424380312 +22::78::1::1424380312 +22::80::3::1424380312 +22::81::1::1424380312 +22::82::1::1424380312 +22::84::1::1424380312 +22::86::1::1424380312 +22::87::3::1424380312 +22::88::5::1424380312 +22::90::2::1424380312 +22::92::3::1424380312 +22::95::2::1424380312 +22::96::2::1424380312 +22::98::4::1424380312 +22::99::1::1424380312 +23::0::1::1424380312 +23::2::1::1424380312 +23::4::1::1424380312 +23::6::2::1424380312 +23::10::4::1424380312 +23::12::1::1424380312 +23::13::4::1424380312 +23::14::1::1424380312 +23::15::1::1424380312 +23::18::4::1424380312 +23::22::2::1424380312 +23::23::4::1424380312 +23::24::1::1424380312 +23::25::1::1424380312 +23::26::1::1424380312 +23::27::5::1424380312 +23::28::1::1424380312 +23::29::1::1424380312 +23::30::4::1424380312 +23::32::5::1424380312 +23::33::2::1424380312 +23::36::3::1424380312 +23::37::1::1424380312 +23::38::1::1424380312 +23::39::1::1424380312 +23::43::1::1424380312 +23::48::5::1424380312 +23::49::5::1424380312 +23::50::4::1424380312 +23::53::1::1424380312 +23::55::5::1424380312 +23::57::1::1424380312 +23::59::1::1424380312 +23::60::1::1424380312 +23::61::1::1424380312 +23::64::4::1424380312 
+23::65::5::1424380312 +23::66::2::1424380312 +23::67::1::1424380312 +23::68::3::1424380312 +23::69::1::1424380312 +23::72::1::1424380312 +23::73::3::1424380312 +23::77::1::1424380312 +23::82::2::1424380312 +23::83::1::1424380312 +23::84::1::1424380312 +23::85::1::1424380312 +23::87::3::1424380312 +23::88::1::1424380312 +23::95::2::1424380312 +23::97::1::1424380312 +24::4::1::1424380312 +24::6::3::1424380312 +24::7::1::1424380312 +24::10::2::1424380312 +24::12::1::1424380312 +24::15::1::1424380312 +24::19::1::1424380312 +24::24::1::1424380312 +24::27::3::1424380312 +24::30::5::1424380312 +24::31::1::1424380312 +24::32::3::1424380312 +24::33::1::1424380312 +24::37::1::1424380312 +24::39::1::1424380312 +24::40::1::1424380312 +24::42::1::1424380312 +24::43::3::1424380312 +24::45::2::1424380312 +24::46::1::1424380312 +24::47::1::1424380312 +24::48::1::1424380312 +24::49::1::1424380312 +24::50::1::1424380312 +24::52::5::1424380312 +24::57::1::1424380312 +24::59::4::1424380312 +24::63::4::1424380312 +24::65::1::1424380312 +24::66::1::1424380312 +24::67::1::1424380312 +24::68::3::1424380312 +24::69::5::1424380312 +24::71::1::1424380312 +24::72::4::1424380312 +24::77::4::1424380312 +24::78::1::1424380312 +24::80::1::1424380312 +24::82::1::1424380312 +24::84::1::1424380312 +24::86::1::1424380312 +24::87::1::1424380312 +24::88::2::1424380312 +24::89::1::1424380312 +24::90::5::1424380312 +24::91::1::1424380312 +24::92::1::1424380312 +24::94::2::1424380312 +24::95::1::1424380312 +24::96::5::1424380312 +24::98::1::1424380312 +24::99::1::1424380312 +25::1::3::1424380312 +25::2::1::1424380312 +25::7::1::1424380312 +25::9::1::1424380312 +25::12::3::1424380312 +25::16::3::1424380312 +25::17::1::1424380312 +25::18::1::1424380312 +25::20::1::1424380312 +25::22::1::1424380312 +25::23::1::1424380312 +25::26::2::1424380312 +25::29::1::1424380312 +25::30::1::1424380312 +25::31::2::1424380312 +25::33::4::1424380312 +25::34::3::1424380312 +25::35::2::1424380312 +25::36::1::1424380312 +25::37::1::1424380312 +25::40::1::1424380312 +25::41::1::1424380312 +25::43::1::1424380312 +25::47::4::1424380312 +25::50::1::1424380312 +25::51::1::1424380312 +25::53::1::1424380312 +25::56::1::1424380312 +25::58::2::1424380312 +25::64::2::1424380312 +25::67::2::1424380312 +25::68::1::1424380312 +25::70::1::1424380312 +25::71::4::1424380312 +25::73::1::1424380312 +25::74::1::1424380312 +25::76::1::1424380312 +25::79::1::1424380312 +25::82::1::1424380312 +25::84::2::1424380312 +25::85::1::1424380312 +25::91::3::1424380312 +25::92::1::1424380312 +25::94::1::1424380312 +25::95::1::1424380312 +25::97::2::1424380312 +26::0::1::1424380312 +26::1::1::1424380312 +26::2::1::1424380312 +26::3::1::1424380312 +26::4::4::1424380312 +26::5::2::1424380312 +26::6::3::1424380312 +26::7::5::1424380312 +26::13::3::1424380312 +26::14::1::1424380312 +26::16::1::1424380312 +26::18::3::1424380312 +26::20::1::1424380312 +26::21::3::1424380312 +26::22::5::1424380312 +26::23::5::1424380312 +26::24::5::1424380312 +26::27::1::1424380312 +26::31::1::1424380312 +26::35::1::1424380312 +26::36::4::1424380312 +26::40::1::1424380312 +26::44::1::1424380312 +26::45::2::1424380312 +26::47::1::1424380312 +26::48::1::1424380312 +26::49::3::1424380312 +26::50::2::1424380312 +26::52::1::1424380312 +26::54::4::1424380312 +26::55::1::1424380312 +26::57::3::1424380312 +26::58::1::1424380312 +26::61::1::1424380312 +26::62::2::1424380312 +26::66::1::1424380312 +26::68::4::1424380312 +26::71::1::1424380312 +26::73::4::1424380312 +26::76::1::1424380312 +26::81::3::1424380312 
+26::85::1::1424380312 +26::86::3::1424380312 +26::88::5::1424380312 +26::91::1::1424380312 +26::94::5::1424380312 +26::95::1::1424380312 +26::96::1::1424380312 +26::97::1::1424380312 +27::0::1::1424380312 +27::9::1::1424380312 +27::10::1::1424380312 +27::18::4::1424380312 +27::19::3::1424380312 +27::20::1::1424380312 +27::22::2::1424380312 +27::24::2::1424380312 +27::25::1::1424380312 +27::27::3::1424380312 +27::28::1::1424380312 +27::29::1::1424380312 +27::31::1::1424380312 +27::33::3::1424380312 +27::40::1::1424380312 +27::42::1::1424380312 +27::43::1::1424380312 +27::44::3::1424380312 +27::45::1::1424380312 +27::51::3::1424380312 +27::52::1::1424380312 +27::55::3::1424380312 +27::57::1::1424380312 +27::59::1::1424380312 +27::60::1::1424380312 +27::61::1::1424380312 +27::64::1::1424380312 +27::66::3::1424380312 +27::68::1::1424380312 +27::70::1::1424380312 +27::71::2::1424380312 +27::72::1::1424380312 +27::75::3::1424380312 +27::78::1::1424380312 +27::80::3::1424380312 +27::82::1::1424380312 +27::83::3::1424380312 +27::86::1::1424380312 +27::87::2::1424380312 +27::90::1::1424380312 +27::91::1::1424380312 +27::92::1::1424380312 +27::93::1::1424380312 +27::94::2::1424380312 +27::95::1::1424380312 +27::98::1::1424380312 +28::0::3::1424380312 +28::1::1::1424380312 +28::2::4::1424380312 +28::3::1::1424380312 +28::6::1::1424380312 +28::7::1::1424380312 +28::12::5::1424380312 +28::13::2::1424380312 +28::14::1::1424380312 +28::15::1::1424380312 +28::17::1::1424380312 +28::19::3::1424380312 +28::20::1::1424380312 +28::23::3::1424380312 +28::24::3::1424380312 +28::27::1::1424380312 +28::29::1::1424380312 +28::33::1::1424380312 +28::34::1::1424380312 +28::36::1::1424380312 +28::38::2::1424380312 +28::39::2::1424380312 +28::44::1::1424380312 +28::45::1::1424380312 +28::49::4::1424380312 +28::50::1::1424380312 +28::52::1::1424380312 +28::54::1::1424380312 +28::56::1::1424380312 +28::57::3::1424380312 +28::58::1::1424380312 +28::59::1::1424380312 +28::60::1::1424380312 +28::62::3::1424380312 +28::63::1::1424380312 +28::65::1::1424380312 +28::75::1::1424380312 +28::78::1::1424380312 +28::81::5::1424380312 +28::82::4::1424380312 +28::83::1::1424380312 +28::85::1::1424380312 +28::88::2::1424380312 +28::89::4::1424380312 +28::90::1::1424380312 +28::92::5::1424380312 +28::94::1::1424380312 +28::95::2::1424380312 +28::98::1::1424380312 +28::99::1::1424380312 +29::3::1::1424380312 +29::4::1::1424380312 +29::5::1::1424380312 +29::7::2::1424380312 +29::9::1::1424380312 +29::10::3::1424380312 +29::11::1::1424380312 +29::13::3::1424380312 +29::14::1::1424380312 +29::15::1::1424380312 +29::17::3::1424380312 +29::19::3::1424380312 +29::22::3::1424380312 +29::23::4::1424380312 +29::25::1::1424380312 +29::29::1::1424380312 +29::31::1::1424380312 +29::32::4::1424380312 +29::33::2::1424380312 +29::36::2::1424380312 +29::38::3::1424380312 +29::39::1::1424380312 +29::42::1::1424380312 +29::46::5::1424380312 +29::49::3::1424380312 +29::51::2::1424380312 +29::59::1::1424380312 +29::61::1::1424380312 +29::62::1::1424380312 +29::67::1::1424380312 +29::68::3::1424380312 +29::69::1::1424380312 +29::70::1::1424380312 +29::74::1::1424380312 +29::75::1::1424380312 +29::79::2::1424380312 +29::80::1::1424380312 +29::81::2::1424380312 +29::83::1::1424380312 +29::85::1::1424380312 +29::86::1::1424380312 +29::90::4::1424380312 +29::93::1::1424380312 +29::94::4::1424380312 +29::97::1::1424380312 +29::99::1::1424380312 diff --git a/docs/ml-guide.md b/docs/ml-guide.md index 4bf14fba34eec..da6aef7f14c4c 100644 --- a/docs/ml-guide.md +++ 
b/docs/ml-guide.md @@ -23,13 +23,13 @@ to `spark.ml`. Spark ML standardizes APIs for machine learning algorithms to make it easier to combine multiple algorithms into a single pipeline, or workflow. This section covers the key concepts introduced by the Spark ML API. -* **[ML Dataset](ml-guide.html#ml-dataset)**: Spark ML uses the [`SchemaRDD`](api/scala/index.html#org.apache.spark.sql.SchemaRDD) from Spark SQL as a dataset which can hold a variety of data types. +* **[ML Dataset](ml-guide.html#ml-dataset)**: Spark ML uses the [`DataFrame`](api/scala/index.html#org.apache.spark.sql.DataFrame) from Spark SQL as a dataset which can hold a variety of data types. E.g., a dataset could have different columns storing text, feature vectors, true labels, and predictions. -* **[`Transformer`](ml-guide.html#transformers)**: A `Transformer` is an algorithm which can transform one `SchemaRDD` into another `SchemaRDD`. +* **[`Transformer`](ml-guide.html#transformers)**: A `Transformer` is an algorithm which can transform one `DataFrame` into another `DataFrame`. E.g., an ML model is a `Transformer` which transforms an RDD with features into an RDD with predictions. -* **[`Estimator`](ml-guide.html#estimators)**: An `Estimator` is an algorithm which can be fit on a `SchemaRDD` to produce a `Transformer`. +* **[`Estimator`](ml-guide.html#estimators)**: An `Estimator` is an algorithm which can be fit on a `DataFrame` to produce a `Transformer`. E.g., a learning algorithm is an `Estimator` which trains on a dataset and produces a model. * **[`Pipeline`](ml-guide.html#pipeline)**: A `Pipeline` chains multiple `Transformer`s and `Estimator`s together to specify an ML workflow. @@ -39,20 +39,20 @@ E.g., a learning algorithm is an `Estimator` which trains on a dataset and produ ## ML Dataset Machine learning can be applied to a wide variety of data types, such as vectors, text, images, and structured data. -Spark ML adopts the [`SchemaRDD`](api/scala/index.html#org.apache.spark.sql.SchemaRDD) from Spark SQL in order to support a variety of data types under a unified Dataset concept. +Spark ML adopts the [`DataFrame`](api/scala/index.html#org.apache.spark.sql.DataFrame) from Spark SQL in order to support a variety of data types under a unified Dataset concept. -`SchemaRDD` supports many basic and structured types; see the [Spark SQL datatype reference](sql-programming-guide.html#spark-sql-datatype-reference) for a list of supported types. -In addition to the types listed in the Spark SQL guide, `SchemaRDD` can use ML [`Vector`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) types. +`DataFrame` supports many basic and structured types; see the [Spark SQL datatype reference](sql-programming-guide.html#spark-sql-datatype-reference) for a list of supported types. +In addition to the types listed in the Spark SQL guide, `DataFrame` can use ML [`Vector`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) types. -A `SchemaRDD` can be created either implicitly or explicitly from a regular `RDD`. See the code examples below and the [Spark SQL programming guide](sql-programming-guide.html) for examples. +A `DataFrame` can be created either implicitly or explicitly from a regular `RDD`. See the code examples below and the [Spark SQL programming guide](sql-programming-guide.html) for examples. -Columns in a `SchemaRDD` are named. The code examples below use names such as "text," "features," and "label." +Columns in a `DataFrame` are named. 
The code examples below use names such as "text," "features," and "label." ## ML Algorithms ### Transformers -A [`Transformer`](api/scala/index.html#org.apache.spark.ml.Transformer) is an abstraction which includes feature transformers and learned models. Technically, a `Transformer` implements a method `transform()` which converts one `SchemaRDD` into another, generally by appending one or more columns. +A [`Transformer`](api/scala/index.html#org.apache.spark.ml.Transformer) is an abstraction which includes feature transformers and learned models. Technically, a `Transformer` implements a method `transform()` which converts one `DataFrame` into another, generally by appending one or more columns. For example: * A feature transformer might take a dataset, read a column (e.g., text), convert it into a new column (e.g., feature vectors), append the new column to the dataset, and output the updated dataset. @@ -60,7 +60,7 @@ For example: ### Estimators -An [`Estimator`](api/scala/index.html#org.apache.spark.ml.Estimator) abstracts the concept of a learning algorithm or any algorithm which fits or trains on data. Technically, an `Estimator` implements a method `fit()` which accepts a `SchemaRDD` and produces a `Transformer`. +An [`Estimator`](api/scala/index.html#org.apache.spark.ml.Estimator) abstracts the concept of a learning algorithm or any algorithm which fits or trains on data. Technically, an `Estimator` implements a method `fit()` which accepts a `DataFrame` and produces a `Transformer`. For example, a learning algorithm such as `LogisticRegression` is an `Estimator`, and calling `fit()` trains a `LogisticRegressionModel`, which is a `Transformer`. ### Properties of ML Algorithms @@ -101,7 +101,7 @@ We illustrate this for the simple text document workflow. The figure below is f Above, the top row represents a `Pipeline` with three stages. The first two (`Tokenizer` and `HashingTF`) are `Transformer`s (blue), and the third (`LogisticRegression`) is an `Estimator` (red). -The bottom row represents data flowing through the pipeline, where cylinders indicate `SchemaRDD`s. +The bottom row represents data flowing through the pipeline, where cylinders indicate `DataFrame`s. The `Pipeline.fit()` method is called on the original dataset which has raw text documents and labels. The `Tokenizer.transform()` method splits the raw text documents into words, adding a new column with words into the dataset. The `HashingTF.transform()` method converts the words column into feature vectors, adding a new column with those vectors to the dataset. @@ -130,7 +130,7 @@ Each stage's `transform()` method updates the dataset and passes it to the next *DAG `Pipeline`s*: A `Pipeline`'s stages are specified as an ordered array. The examples given here are all for linear `Pipeline`s, i.e., `Pipeline`s in which each stage uses data produced by the previous stage. It is possible to create non-linear `Pipeline`s as long as the data flow graph forms a Directed Acyclic Graph (DAG). This graph is currently specified implicitly based on the input and output column names of each stage (generally specified as parameters). If the `Pipeline` forms a DAG, then the stages must be specified in topological order. -*Runtime checking*: Since `Pipeline`s can operate on datasets with varied types, they cannot use compile-time type checking. `Pipeline`s and `PipelineModel`s instead do runtime checking before actually running the `Pipeline`. 
This type checking is done using the dataset *schema*, a description of the data types of columns in the `SchemaRDD`. +*Runtime checking*: Since `Pipeline`s can operate on datasets with varied types, they cannot use compile-time type checking. `Pipeline`s and `PipelineModel`s instead do runtime checking before actually running the `Pipeline`. This type checking is done using the dataset *schema*, a description of the data types of columns in the `DataFrame`. ## Parameters @@ -171,12 +171,12 @@ import org.apache.spark.sql.{Row, SQLContext} val conf = new SparkConf().setAppName("SimpleParamsExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) -import sqlContext._ +import sqlContext.implicits._ // Prepare training data. // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes -// into SchemaRDDs, where it uses the case class metadata to infer the schema. -val training = sparkContext.parallelize(Seq( +// into DataFrames, where it uses the case class metadata to infer the schema. +val training = sc.parallelize(Seq( LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), @@ -192,7 +192,7 @@ lr.setMaxIter(10) .setRegParam(0.01) // Learn a LogisticRegression model. This uses the parameters stored in lr. -val model1 = lr.fit(training) +val model1 = lr.fit(training.toDF) // Since model1 is a Model (i.e., a Transformer produced by an Estimator), // we can view the parameters it used during fit(). // This prints the parameter (name: value) pairs, where names are unique IDs for this @@ -203,33 +203,35 @@ println("Model 1 was fit using parameters: " + model1.fittingParamMap) // which supports several methods for specifying parameters. val paramMap = ParamMap(lr.maxIter -> 20) paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter. -paramMap.put(lr.regParam -> 0.1, lr.threshold -> 0.5) // Specify multiple Params. +paramMap.put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params. // One can also combine ParamMaps. -val paramMap2 = ParamMap(lr.scoreCol -> "probability") // Changes output column name. +val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name val paramMapCombined = paramMap ++ paramMap2 // Now learn a new model using the paramMapCombined parameters. // paramMapCombined overrides all parameters set earlier via lr.set* methods. -val model2 = lr.fit(training, paramMapCombined) +val model2 = lr.fit(training.toDF, paramMapCombined) println("Model 2 was fit using parameters: " + model2.fittingParamMap) -// Prepare test documents. -val test = sparkContext.parallelize(Seq( +// Prepare test data. +val test = sc.parallelize(Seq( LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))) -// Make predictions on test documents using the Transformer.transform() method. +// Make predictions on test data using the Transformer.transform() method. // LogisticRegression.transform will only use the 'features' column. -// Note that model2.transform() outputs a 'probability' column instead of the usual 'score' -// column since we renamed the lr.scoreCol parameter previously. 
-model2.transform(test) - .select('features, 'label, 'probability, 'prediction) +// Note that model2.transform() outputs a 'myProbability' column instead of the usual +// 'probability' column since we renamed the lr.probabilityCol parameter previously. +model2.transform(test.toDF) + .select("features", "label", "myProbability", "prediction") .collect() - .foreach { case Row(features: Vector, label: Double, prob: Double, prediction: Double) => - println("(" + features + ", " + label + ") -> prob=" + prob + ", prediction=" + prediction) + .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => + println(s"($features, $label) -> prob=$prob, prediction=$prediction") } + +sc.stop() {% endhighlight %}
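The example above leans on two rules: later `put` calls win within a `ParamMap`, and a `ParamMap` passed to `fit()` overrides anything set through `lr.set*`. As a minimal sketch only, assuming the same `lr` estimator and `training` data as the Scala example above (the `0.05` value is purely illustrative, not taken from the patch):

{% highlight scala %}
// Sketch: continues the Scala SimpleParamsExample session above.
val base = ParamMap(lr.maxIter -> 20)
base.put(lr.maxIter, 30)                       // this map now carries maxIter = 30
val overrides = ParamMap(lr.regParam -> 0.05)  // illustrative value
val combined = base ++ overrides               // one map with maxIter = 30 and regParam = 0.05
// Parameters supplied at fit() time take precedence over values set via lr.set* earlier.
val tunedModel = lr.fit(training.toDF, combined)
{% endhighlight %}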
    @@ -244,23 +246,23 @@ import org.apache.spark.ml.param.ParamMap; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.sql.api.java.JavaSQLContext; -import org.apache.spark.sql.api.java.JavaSchemaRDD; -import org.apache.spark.sql.api.java.Row; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.Row; SparkConf conf = new SparkConf().setAppName("JavaSimpleParamsExample"); JavaSparkContext jsc = new JavaSparkContext(conf); -JavaSQLContext jsql = new JavaSQLContext(jsc); +SQLContext jsql = new SQLContext(jsc); // Prepare training data. -// We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes -// into SchemaRDDs, where it uses the case class metadata to infer the schema. +// We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans +// into DataFrames, where it uses the bean metadata to infer the schema. List localTraining = Lists.newArrayList( new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))); -JavaSchemaRDD training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); +DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledPoint.class); // Create a LogisticRegression instance. This instance is an Estimator. LogisticRegression lr = new LogisticRegression(); @@ -281,13 +283,13 @@ System.out.println("Model 1 was fit using parameters: " + model1.fittingParamMap // We may alternatively specify parameters using a ParamMap. ParamMap paramMap = new ParamMap(); -paramMap.put(lr.maxIter(), 20); // Specify 1 Param. +paramMap.put(lr.maxIter().w(20)); // Specify 1 Param. paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter. -paramMap.put(lr.regParam(), 0.1); +paramMap.put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params. // One can also combine ParamMaps. ParamMap paramMap2 = new ParamMap(); -paramMap2.put(lr.scoreCol(), "probability"); // Changes output column name. +paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2); // Now learn a new model using the paramMapCombined parameters. @@ -300,19 +302,19 @@ List localTest = Lists.newArrayList( new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))); -JavaSchemaRDD test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); +DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), LabeledPoint.class); // Make predictions on test documents using the Transformer.transform() method. // LogisticRegression.transform will only use the 'features' column. -// Note that model2.transform() outputs a 'probability' column instead of the usual 'score' -// column since we renamed the lr.scoreCol parameter previously. 
-model2.transform(test).registerAsTable("results"); -JavaSchemaRDD results = - jsql.sql("SELECT features, label, probability, prediction FROM results"); -for (Row r: results.collect()) { +// Note that model2.transform() outputs a 'myProbability' column instead of the usual +// 'probability' column since we renamed the lr.probabilityCol parameter previously. +DataFrame results = model2.transform(test); +for (Row r: results.select("features", "label", "myProbability", "prediction").collect()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) + ", prediction=" + r.get(3)); } + +jsc.stop(); {% endhighlight %}
    @@ -330,6 +332,7 @@ import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} +import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} // Labeled and unlabeled instance types. @@ -337,14 +340,14 @@ import org.apache.spark.sql.{Row, SQLContext} case class LabeledDocument(id: Long, text: String, label: Double) case class Document(id: Long, text: String) -// Set up contexts. Import implicit conversions to SchemaRDD from sqlContext. +// Set up contexts. Import implicit conversions to DataFrame from sqlContext. val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) -import sqlContext._ +import sqlContext.implicits._ // Prepare training documents, which are labeled. -val training = sparkContext.parallelize(Seq( +val training = sc.parallelize(Seq( LabeledDocument(0L, "a b c d e spark", 1.0), LabeledDocument(1L, "b d", 0.0), LabeledDocument(2L, "spark f g h", 1.0), @@ -365,30 +368,32 @@ val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. -val model = pipeline.fit(training) +val model = pipeline.fit(training.toDF) // Prepare test documents, which are unlabeled. -val test = sparkContext.parallelize(Seq( +val test = sc.parallelize(Seq( Document(4L, "spark i j k"), Document(5L, "l m n"), Document(6L, "mapreduce spark"), Document(7L, "apache hadoop"))) // Make predictions on test documents. -model.transform(test) - .select('id, 'text, 'score, 'prediction) +model.transform(test.toDF) + .select("id", "text", "probability", "prediction") .collect() - .foreach { case Row(id: Long, text: String, score: Double, prediction: Double) => - println("(" + id + ", " + text + ") --> score=" + score + ", prediction=" + prediction) + .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => + println(s"($id, $text) --> prob=$prob, prediction=$prediction") } + +sc.stop() {% endhighlight %}
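Because each stage appends its output column(s), the easiest way to see what the fitted `PipelineModel` produced is to look at the resulting schema. A small sketch, assuming the `model` and `test` values from the Scala example above; the column names in the comment are the expected defaults for these stages, not captured output:

{% highlight scala %}
// Sketch: continues the Scala pipeline example above.
val transformed = model.transform(test.toDF)
transformed.printSchema()
// Expected, roughly: id and text (original columns), plus words (Tokenizer),
// features (HashingTF), and rawPrediction/probability/prediction (LogisticRegression).
{% endhighlight %}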
    {% highlight java %} -import java.io.Serializable; import java.util.List; import com.google.common.collect.Lists; +import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineModel; @@ -396,10 +401,9 @@ import org.apache.spark.ml.PipelineStage; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.feature.HashingTF; import org.apache.spark.ml.feature.Tokenizer; -import org.apache.spark.sql.api.java.JavaSQLContext; -import org.apache.spark.sql.api.java.JavaSchemaRDD; -import org.apache.spark.sql.api.java.Row; -import org.apache.spark.SparkConf; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; // Labeled and unlabeled instance types. // Spark SQL can infer schema from Java Beans. @@ -434,7 +438,7 @@ public class LabeledDocument extends Document implements Serializable { // Set up contexts. SparkConf conf = new SparkConf().setAppName("JavaSimpleTextClassificationPipeline"); JavaSparkContext jsc = new JavaSparkContext(conf); -JavaSQLContext jsql = new JavaSQLContext(jsc); +SQLContext jsql = new SQLContext(jsc); // Prepare training documents, which are labeled. List localTraining = Lists.newArrayList( @@ -442,8 +446,7 @@ List localTraining = Lists.newArrayList( new LabeledDocument(1L, "b d", 0.0), new LabeledDocument(2L, "spark f g h", 1.0), new LabeledDocument(3L, "hadoop mapreduce", 0.0)); -JavaSchemaRDD training = - jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); +DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -468,16 +471,62 @@ List localTest = Lists.newArrayList( new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), new Document(7L, "apache hadoop")); -JavaSchemaRDD test = - jsql.createDataFrame(jsc.parallelize(localTest), Document.class); +DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. -model.transform(test).registerAsTable("prediction"); -JavaSchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction"); -for (Row r: predictions.collect()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> score=" + r.get(2) +DataFrame predictions = model.transform(test); +for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) { + System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } + +jsc.stop(); +{% endhighlight %} +
    + +
    +{% highlight python %} +from pyspark import SparkContext +from pyspark.ml import Pipeline +from pyspark.ml.classification import LogisticRegression +from pyspark.ml.feature import HashingTF, Tokenizer +from pyspark.sql import Row, SQLContext + +sc = SparkContext(appName="SimpleTextClassificationPipeline") +sqlCtx = SQLContext(sc) + +# Prepare training documents, which are labeled. +LabeledDocument = Row("id", "text", "label") +training = sc.parallelize([(0L, "a b c d e spark", 1.0), + (1L, "b d", 0.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 0.0)]) \ + .map(lambda x: LabeledDocument(*x)).toDF() + +# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. +tokenizer = Tokenizer(inputCol="text", outputCol="words") +hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") +lr = LogisticRegression(maxIter=10, regParam=0.01) +pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) + +# Fit the pipeline to training documents. +model = pipeline.fit(training) + +# Prepare test documents, which are unlabeled. +Document = Row("id", "text") +test = sc.parallelize([(4L, "spark i j k"), + (5L, "l m n"), + (6L, "mapreduce spark"), + (7L, "apache hadoop")]) \ + .map(lambda x: Document(*x)).toDF() + +# Make predictions on test documents and print columns of interest. +prediction = model.transform(test) +selected = prediction.select("id", "text", "prediction") +for row in selected.collect(): + print row + +sc.stop() {% endhighlight %}
    @@ -508,21 +557,21 @@ However, it is also a well-established method for choosing parameters which is m
    {% highlight scala %} import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.SparkContext._ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} +import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} val conf = new SparkConf().setAppName("CrossValidatorExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) -import sqlContext._ +import sqlContext.implicits._ // Prepare training documents, which are labeled. -val training = sparkContext.parallelize(Seq( +val training = sc.parallelize(Seq( LabeledDocument(0L, "a b c d e spark", 1.0), LabeledDocument(1L, "b d", 0.0), LabeledDocument(2L, "spark f g h", 1.0), @@ -565,24 +614,24 @@ crossval.setEstimatorParamMaps(paramGrid) crossval.setNumFolds(2) // Use 3+ in practice // Run cross-validation, and choose the best set of parameters. -val cvModel = crossval.fit(training) -// Get the best LogisticRegression model (with the best set of parameters from paramGrid). -val lrModel = cvModel.bestModel +val cvModel = crossval.fit(training.toDF) // Prepare test documents, which are unlabeled. -val test = sparkContext.parallelize(Seq( +val test = sc.parallelize(Seq( Document(4L, "spark i j k"), Document(5L, "l m n"), Document(6L, "mapreduce spark"), Document(7L, "apache hadoop"))) // Make predictions on test documents. cvModel uses the best model found (lrModel). -cvModel.transform(test) - .select('id, 'text, 'score, 'prediction) +cvModel.transform(test.toDF) + .select("id", "text", "probability", "prediction") .collect() - .foreach { case Row(id: Long, text: String, score: Double, prediction: Double) => - println("(" + id + ", " + text + ") --> score=" + score + ", prediction=" + prediction) + .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => + println(s"($id, $text) --> prob=$prob, prediction=$prediction") } + +sc.stop() {% endhighlight %}
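The removed lines above drop the explicit `val lrModel = cvModel.bestModel` binding from the example. If the winning model is still wanted, a minimal sketch, assuming the `cvModel` from the example above and that `CrossValidatorModel` keeps exposing the best model as in the removed lines:

{% highlight scala %}
// Minimal sketch (not taken from the patch): read back the model chosen by cross-validation.
// Assumes `cvModel` from the example above and that `bestModel` is still exposed,
// as in the lines removed by this hunk.
val bestModel = cvModel.bestModel
println("Best model selected by cross-validation: " + bestModel)
{% endhighlight %}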
    @@ -592,7 +641,6 @@ import java.util.List; import com.google.common.collect.Lists; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.ml.Model; import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineStage; import org.apache.spark.ml.classification.LogisticRegression; @@ -603,13 +651,13 @@ import org.apache.spark.ml.param.ParamMap; import org.apache.spark.ml.tuning.CrossValidator; import org.apache.spark.ml.tuning.CrossValidatorModel; import org.apache.spark.ml.tuning.ParamGridBuilder; -import org.apache.spark.sql.api.java.JavaSQLContext; -import org.apache.spark.sql.api.java.JavaSchemaRDD; -import org.apache.spark.sql.api.java.Row; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; SparkConf conf = new SparkConf().setAppName("JavaCrossValidatorExample"); JavaSparkContext jsc = new JavaSparkContext(conf); -JavaSQLContext jsql = new JavaSQLContext(jsc); +SQLContext jsql = new SQLContext(jsc); // Prepare training documents, which are labeled. List localTraining = Lists.newArrayList( @@ -625,8 +673,7 @@ List localTraining = Lists.newArrayList( new LabeledDocument(9L, "a e c l", 0.0), new LabeledDocument(10L, "spark compile", 1.0), new LabeledDocument(11L, "hadoop software", 0.0)); -JavaSchemaRDD training = - jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); +DataFrame training = jsql.createDataFrame(jsc.parallelize(localTraining), LabeledDocument.class); // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. Tokenizer tokenizer = new Tokenizer() @@ -660,8 +707,6 @@ crossval.setNumFolds(2); // Use 3+ in practice // Run cross-validation, and choose the best set of parameters. CrossValidatorModel cvModel = crossval.fit(training); -// Get the best LogisticRegression model (with the best set of parameters from paramGrid). -Model lrModel = cvModel.bestModel(); // Prepare test documents, which are unlabeled. List localTest = Lists.newArrayList( @@ -669,15 +714,16 @@ List localTest = Lists.newArrayList( new Document(5L, "l m n"), new Document(6L, "mapreduce spark"), new Document(7L, "apache hadoop")); -JavaSchemaRDD test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); +DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). -cvModel.transform(test).registerAsTable("prediction"); -JavaSchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction"); -for (Row r: predictions.collect()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> score=" + r.get(2) +DataFrame predictions = cvModel.transform(test); +for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) { + System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } + +jsc.stop(); {% endhighlight %}
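These example hunks all apply the same Spark SQL changes that the migration notes in the next hunk describe. A compact sketch of the before and after idioms, where `sc`, `training`, `test`, and `model` are assumed to stand in for the objects defined in the examples above:

{% highlight scala %}
// Minimal sketch (not taken from the patch) of the 1.2 -> 1.3 idioms used in these hunks.
// Spark 1.2 (old style, as in the removed lines):
//   import sqlContext._                           // implicit RDD -> SchemaRDD conversions
//   model.transform(test).select('id, 'score)     // "score" column of type Double
// Spark 1.3 (new style, as in the added lines); `sc`, `training`, `test`, `model` assumed:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._                      // implicits now live in sqlContext.implicits
val trainingDF = training.toDF()                   // RDDs of case classes convert explicitly
model.transform(test.toDF())
  .select("id", "probability", "prediction")       // "score" is now "probability", a Vector
{% endhighlight %}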
    @@ -686,6 +732,21 @@ for (Row r: predictions.collect()) { # Dependencies Spark ML currently depends on MLlib and has the same dependencies. -Please see the [MLlib Dependencies guide](mllib-guide.html#Dependencies) for more info. +Please see the [MLlib Dependencies guide](mllib-guide.html#dependencies) for more info. Spark ML also depends upon Spark SQL, but the relevant parts of Spark SQL do not bring additional dependencies. + +# Migration Guide + +## From 1.2 to 1.3 + +The main API changes are from Spark SQL. We list the most important changes here: + +* The old [SchemaRDD](http://spark.apache.org/docs/1.2.1/api/scala/index.html#org.apache.spark.sql.SchemaRDD) has been replaced with [DataFrame](api/scala/index.html#org.apache.spark.sql.DataFrame) with a somewhat modified API. All algorithms in Spark ML which used to use SchemaRDD now use DataFrame. +* In Spark 1.2, we used implicit conversions from `RDD`s of `LabeledPoint` into `SchemaRDD`s by calling `import sqlContext._` where `sqlContext` was an instance of `SQLContext`. These implicits have been moved, so we now call `import sqlContext.implicits._`. +* Java APIs for SQL have also changed accordingly. Please see the examples above and the [Spark SQL Programming Guide](sql-programming-guide.html) for details. + +Other changes were in `LogisticRegression`: + +* The `scoreCol` output column (with default value "score") was renamed to be `probabilityCol` (with default value "probability"). The type was originally `Double` (for the probability of class 1.0), but it is now `Vector` (for the probability of each class, to support multiclass classification in the future). +* In Spark 1.2, `LogisticRegressionModel` did not include an intercept. In Spark 1.3, it includes an intercept; however, it will always be 0.0 since it uses the default settings for [spark.mllib.LogisticRegressionWithLBFGS](api/scala/index.html#org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS). The option to use an intercept will be added in the future. diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 0ca51f92d7a61..4c7a7d9115ca1 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -46,7 +46,7 @@ and the migration guide below will explain all changes between releases. # spark.ml: high-level APIs for ML pipelines -Spark 1.2 includes a new package called `spark.ml`, which aims to provide a uniform set of +Spark 1.2 introduced a new package called `spark.ml`, which aims to provide a uniform set of high-level APIs that help users create and tune practical machine learning pipelines. It is currently an alpha component, and we would like to hear back from the community about how it fits real-world use cases and how it could be improved. @@ -92,125 +92,22 @@ version 1.4 or newer. # Migration Guide -## From 1.1 to 1.2 +For the `spark.ml` package, please see the [spark.ml Migration Guide](ml-guide.html#migration-guide). -The only API changes in MLlib v1.2 are in -[`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), -which continues to be an experimental API in MLlib 1.2: +## From 1.2 to 1.3 -1. *(Breaking change)* The Scala API for classification takes a named argument specifying the number -of classes. In MLlib v1.1, this argument was called `numClasses` in Python and -`numClassesForClassification` in Scala. In MLlib v1.2, the names are both set to `numClasses`. 
-This `numClasses` parameter is specified either via -[`Strategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.Strategy) -or via [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) -static `trainClassifier` and `trainRegressor` methods. +In the `spark.mllib` package, there were several breaking changes. The first change (in `ALS`) is the only one in a component not marked as Alpha or Experimental. -2. *(Breaking change)* The API for -[`Node`](api/scala/index.html#org.apache.spark.mllib.tree.model.Node) has changed. -This should generally not affect user code, unless the user manually constructs decision trees -(instead of using the `trainClassifier` or `trainRegressor` methods). -The tree `Node` now includes more information, including the probability of the predicted label -(for classification). +* *(Breaking change)* In [`ALS`](api/scala/index.html#org.apache.spark.mllib.recommendation.ALS), the extraneous method `solveLeastSquares` has been removed. The `DeveloperApi` method `analyzeBlocks` was also removed. +* *(Breaking change)* [`StandardScalerModel`](api/scala/index.html#org.apache.spark.mllib.feature.StandardScalerModel) remains an Alpha component. In it, the `variance` method has been replaced with the `std` method. To compute the column variance values returned by the original `variance` method, simply square the standard deviation values returned by `std`. +* *(Breaking change)* [`StreamingLinearRegressionWithSGD`](api/scala/index.html#org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD) remains an Experimental component. In it, there were two changes: + * The constructor taking arguments was removed in favor of a builder patten using the default constructor plus parameter setter methods. + * Variable `model` is no longer public. +* *(Breaking change)* [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) remains an Experimental component. In it and its associated classes, there were several changes: + * In `DecisionTree`, the deprecated class method `train` has been removed. (The object/static `train` methods remain.) + * In `Strategy`, the `checkpointDir` parameter has been removed. Checkpointing is still supported, but the checkpoint directory must be set before calling tree and tree ensemble training. +* `PythonMLlibAPI` (the interface between Scala/Java and Python for MLlib) was a public API but is now private, declared `private[python]`. This was never meant for external use. -3. Printing methods' output has changed. The `toString` (Scala/Java) and `__repr__` (Python) methods used to print the full model; they now print a summary. For the full model, use `toDebugString`. +## Previous Spark Versions -Examples in the Spark distribution and examples in the -[Decision Trees Guide](mllib-decision-tree.html#examples) have been updated accordingly. - -## From 1.0 to 1.1 - -The only API changes in MLlib v1.1 are in -[`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), -which continues to be an experimental API in MLlib 1.1: - -1. *(Breaking change)* The meaning of tree depth has been changed by 1 in order to match -the implementations of trees in -[scikit-learn](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree) -and in [rpart](http://cran.r-project.org/web/packages/rpart/index.html). -In MLlib v1.0, a depth-1 tree had 1 leaf node, and a depth-2 tree had 1 root node and 2 leaf nodes. 
-In MLlib v1.1, a depth-0 tree has 1 leaf node, and a depth-1 tree has 1 root node and 2 leaf nodes. -This depth is specified by the `maxDepth` parameter in -[`Strategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.Strategy) -or via [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) -static `trainClassifier` and `trainRegressor` methods. - -2. *(Non-breaking change)* We recommend using the newly added `trainClassifier` and `trainRegressor` -methods to build a [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), -rather than using the old parameter class `Strategy`. These new training methods explicitly -separate classification and regression, and they replace specialized parameter types with -simple `String` types. - -Examples of the new, recommended `trainClassifier` and `trainRegressor` are given in the -[Decision Trees Guide](mllib-decision-tree.html#examples). - -## From 0.9 to 1.0 - -In MLlib v1.0, we support both dense and sparse input in a unified way, which introduces a few -breaking changes. If your data is sparse, please store it in a sparse format instead of dense to -take advantage of sparsity in both storage and computation. Details are described below. - -
    -
    - -We used to represent a feature vector by `Array[Double]`, which is replaced by -[`Vector`](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) in v1.0. Algorithms that used -to accept `RDD[Array[Double]]` now take -`RDD[Vector]`. [`LabeledPoint`](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) -is now a wrapper of `(Double, Vector)` instead of `(Double, Array[Double])`. Converting -`Array[Double]` to `Vector` is straightforward: - -{% highlight scala %} -import org.apache.spark.mllib.linalg.{Vector, Vectors} - -val array: Array[Double] = ... // a double array -val vector: Vector = Vectors.dense(array) // a dense vector -{% endhighlight %} - -[`Vectors`](api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$) provides factory methods to create sparse vectors. - -*Note*: Scala imports `scala.collection.immutable.Vector` by default, so you have to import `org.apache.spark.mllib.linalg.Vector` explicitly to use MLlib's `Vector`. - -
    - -
    - -We used to represent a feature vector by `double[]`, which is replaced by -[`Vector`](api/java/index.html?org/apache/spark/mllib/linalg/Vector.html) in v1.0. Algorithms that used -to accept `RDD<double[]>` now take -`RDD<Vector>`. [`LabeledPoint`](api/java/index.html?org/apache/spark/mllib/regression/LabeledPoint.html) -is now a wrapper of `(double, Vector)` instead of `(double, double[])`. Converting `double[]` to -`Vector` is straightforward: - -{% highlight java %} -import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.Vectors; - -double[] array = ... // a double array -Vector vector = Vectors.dense(array); // a dense vector -{% endhighlight %} - -[`Vectors`](api/scala/index.html#org.apache.spark.mllib.linalg.Vectors$) provides factory methods to -create sparse vectors. -
    - -
    - -We used to represent a labeled feature vector in a NumPy array, where the first entry corresponds to -the label and the rest are features. This representation is replaced by class -[`LabeledPoint`](api/python/pyspark.mllib.regression.LabeledPoint-class.html), which takes both -dense and sparse feature vectors. - -{% highlight python %} -from pyspark.mllib.linalg import SparseVector -from pyspark.mllib.regression import LabeledPoint - -# Create a labeled point with a positive label and a dense feature vector. -pos = LabeledPoint(1.0, [1.0, 0.0, 3.0]) - -# Create a labeled point with a negative label and a sparse feature vector. -neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0])) -{% endhighlight %} -
    -
    +Earlier migration guides are archived [on this page](mllib-migration-guides.html). diff --git a/docs/mllib-migration-guides.md b/docs/mllib-migration-guides.md new file mode 100644 index 0000000000000..4de2d9491ac2b --- /dev/null +++ b/docs/mllib-migration-guides.md @@ -0,0 +1,67 @@ +--- +layout: global +title: Old Migration Guides - MLlib +displayTitle:
    MLlib - Old Migration Guides +description: MLlib migration guides from before Spark SPARK_VERSION_SHORT +--- + +The migration guide for the current Spark version is kept on the [MLlib Programming Guide main page](mllib-guide.html#migration-guide). + +## From 1.1 to 1.2 + +The only API changes in MLlib v1.2 are in +[`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), +which continues to be an experimental API in MLlib 1.2: + +1. *(Breaking change)* The Scala API for classification takes a named argument specifying the number +of classes. In MLlib v1.1, this argument was called `numClasses` in Python and +`numClassesForClassification` in Scala. In MLlib v1.2, the names are both set to `numClasses`. +This `numClasses` parameter is specified either via +[`Strategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.Strategy) +or via [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) +static `trainClassifier` and `trainRegressor` methods. + +2. *(Breaking change)* The API for +[`Node`](api/scala/index.html#org.apache.spark.mllib.tree.model.Node) has changed. +This should generally not affect user code, unless the user manually constructs decision trees +(instead of using the `trainClassifier` or `trainRegressor` methods). +The tree `Node` now includes more information, including the probability of the predicted label +(for classification). + +3. Printing methods' output has changed. The `toString` (Scala/Java) and `__repr__` (Python) methods used to print the full model; they now print a summary. For the full model, use `toDebugString`. + +Examples in the Spark distribution and examples in the +[Decision Trees Guide](mllib-decision-tree.html#examples) have been updated accordingly. + +## From 1.0 to 1.1 + +The only API changes in MLlib v1.1 are in +[`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), +which continues to be an experimental API in MLlib 1.1: + +1. *(Breaking change)* The meaning of tree depth has been changed by 1 in order to match +the implementations of trees in +[scikit-learn](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree) +and in [rpart](http://cran.r-project.org/web/packages/rpart/index.html). +In MLlib v1.0, a depth-1 tree had 1 leaf node, and a depth-2 tree had 1 root node and 2 leaf nodes. +In MLlib v1.1, a depth-0 tree has 1 leaf node, and a depth-1 tree has 1 root node and 2 leaf nodes. +This depth is specified by the `maxDepth` parameter in +[`Strategy`](api/scala/index.html#org.apache.spark.mllib.tree.configuration.Strategy) +or via [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) +static `trainClassifier` and `trainRegressor` methods. + +2. *(Non-breaking change)* We recommend using the newly added `trainClassifier` and `trainRegressor` +methods to build a [`DecisionTree`](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree), +rather than using the old parameter class `Strategy`. These new training methods explicitly +separate classification and regression, and they replace specialized parameter types with +simple `String` types. + +Examples of the new, recommended `trainClassifier` and `trainRegressor` are given in the +[Decision Trees Guide](mllib-decision-tree.html#examples). + +## From 0.9 to 1.0 + +In MLlib v1.0, we support both dense and sparse input in a unified way, which introduces a few +breaking changes. 
If your data is sparse, please store it in a sparse format instead of dense to +take advantage of sparsity in both storage and computation. Details are described below. + diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java index 5d8c5d0a92daa..9bbc14ea40875 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java @@ -34,8 +34,8 @@ import org.apache.spark.ml.tuning.CrossValidatorModel; import org.apache.spark.ml.tuning.ParamGridBuilder; import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; /** * A simple example demonstrating model selection using CrossValidator. @@ -115,9 +115,8 @@ public static void main(String[] args) { DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. cvModel uses the best model found (lrModel). - cvModel.transform(test).registerTempTable("prediction"); - DataFrame predictions = jsql.sql("SELECT id, text, probability, prediction FROM prediction"); - for (Row r: predictions.collect()) { + DataFrame predictions = cvModel.transform(test); + for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java index 4c4d532388781..4e02acce696e6 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java @@ -29,8 +29,8 @@ import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; /** * A simple example demonstrating ways to specify parameters for Estimators and Transformers. @@ -100,10 +100,8 @@ public static void main(String[] args) { // LogisticRegression.transform will only use the 'features' column. // Note that model2.transform() outputs a 'myProbability' column instead of the usual // 'probability' column since we renamed the lr.probabilityCol parameter previously. 
- model2.transform(test).registerTempTable("results"); - DataFrame results = - jsql.sql("SELECT features, label, myProbability, prediction FROM results"); - for (Row r: results.collect()) { + DataFrame results = model2.transform(test); + for (Row r: results.select("features", "label", "myProbability", "prediction").collect()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java index fdcfc888c235f..ef1ec103a879f 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java @@ -30,8 +30,8 @@ import org.apache.spark.ml.feature.HashingTF; import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; /** * A simple text classification pipeline that recognizes "spark" from input text. It uses the Java @@ -82,9 +82,8 @@ public static void main(String[] args) { DataFrame test = jsql.createDataFrame(jsc.parallelize(localTest), Document.class); // Make predictions on test documents. - model.transform(test).registerTempTable("prediction"); - DataFrame predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction"); - for (Row r: predictions.collect()) { + DataFrame predictions = model.transform(test); + for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py index b4d9355b681f6..d281f4fa44282 100644 --- a/examples/src/main/python/ml/simple_text_classification_pipeline.py +++ b/examples/src/main/python/ml/simple_text_classification_pipeline.py @@ -16,10 +16,10 @@ # from pyspark import SparkContext -from pyspark.sql import SQLContext, Row from pyspark.ml import Pipeline -from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.ml.classification import LogisticRegression +from pyspark.ml.feature import HashingTF, Tokenizer +from pyspark.sql import Row, SQLContext """ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala index 7ab892cd7560c..6c0af20461d3b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala @@ -100,7 +100,7 @@ object CrossValidatorExample { Document(7L, "apache hadoop"))) // Make predictions on test documents. cvModel uses the best model found (lrModel). 
- cvModel.transform(test.toDF) + cvModel.transform(test.toDF()) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala index 96b2dd463e253..25f21113bf622 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MovieLensALS.scala @@ -93,8 +93,8 @@ object MovieLensALS { | bin/spark-submit --class org.apache.spark.examples.ml.MovieLensALS \ | examples/target/scala-*/spark-examples-*.jar \ | --rank 10 --maxIter 15 --regParam 0.1 \ - | --movies path/to/movielens/movies.dat \ - | --ratings path/to/movielens/ratings.dat + | --movies data/mllib/als/sample_movielens_movies.txt \ + | --ratings data/mllib/als/sample_movielens_ratings.txt """.stripMargin) } @@ -157,17 +157,23 @@ object MovieLensALS { println(s"Test RMSE = $rmse.") // Inspect false positives. - predictions.registerTempTable("prediction") - sc.textFile(params.movies).map(Movie.parseMovie).toDF().registerTempTable("movie") - sqlContext.sql( - """ - |SELECT userId, prediction.movieId, title, rating, prediction - | FROM prediction JOIN movie ON prediction.movieId = movie.movieId - | WHERE rating <= 1 AND prediction >= 4 - | LIMIT 100 - """.stripMargin) - .collect() - .foreach(println) + // Note: We reference columns in 2 ways: + // (1) predictions("movieId") lets us specify the movieId column in the predictions + // DataFrame, rather than the movieId column in the movies DataFrame. + // (2) $"userId" specifies the userId column in the predictions DataFrame. + // We could also write predictions("userId") but do not have to since + // the movies DataFrame does not have a column "userId." + val movies = sc.textFile(params.movies).map(Movie.parseMovie).toDF() + val falsePositives = predictions.join(movies) + .where((predictions("movieId") === movies("movieId")) + && ($"rating" <= 1) && ($"prediction" >= 4)) + .select($"userId", predictions("movieId"), $"title", $"rating", $"prediction") + val numFalsePositives = falsePositives.count() + println(s"Found $numFalsePositives false positives") + if (numFalsePositives > 0) { + println(s"Example false positives:") + falsePositives.limit(100).collect().foreach(println) + } sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala index e8af5c162586a..bf805149d0af6 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala @@ -40,8 +40,8 @@ object SimpleParamsExample { import sqlContext.implicits._ // Prepare training data. - // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of Java Beans - // into DataFrames, where it uses the bean metadata to infer the schema. + // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes + // into DataFrames, where it uses the case class metadata to infer the schema. 
val training = sc.parallelize(Seq( LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), @@ -94,7 +94,7 @@ object SimpleParamsExample { .select("features", "label", "myProbability", "prediction") .collect() .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) => - println("($features, $label) -> prob=$prob, prediction=$prediction") + println(s"($features, $label) -> prob=$prob, prediction=$prediction") } sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala index a11db6fd5c382..6772efd2c581c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala @@ -83,7 +83,7 @@ object SimpleTextClassificationPipeline { .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => - println("($id, $text) --> prob=$prob, prediction=$prediction") + println(s"($id, $text) --> prob=$prob, prediction=$prediction") } sc.stop() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index 80584ef5e5979..568b65305649f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -19,12 +19,10 @@ package org.apache.spark.mllib.clustering import scala.collection.mutable.IndexedSeq -import breeze.linalg.{diag, DenseMatrix => BreezeMatrix, DenseVector => BDV, SparseVector => BSV, - Transpose, Vector => BV} +import breeze.linalg.{diag, DenseMatrix => BreezeMatrix, DenseVector => BDV, Vector => BV} import org.apache.spark.annotation.Experimental -import org.apache.spark.mllib.linalg.{BLAS, DenseVector, DenseMatrix, Matrices, - SparseVector, Vector, Vectors} +import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, Matrices, Vector, Vectors} import org.apache.spark.mllib.stat.distribution.MultivariateGaussian import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD @@ -43,7 +41,11 @@ import org.apache.spark.util.Utils * less than convergenceTol, or until it has reached the max number of iterations. * While this process is generally guaranteed to converge, it is not guaranteed * to find a global optimum. - * + * + * Note: For high-dimensional data (with many features), this algorithm may perform poorly. + * This is due to high-dimensional data (a) making it difficult to cluster at all (based + * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions. + * * @param k The number of independent Gaussians in the mixture model * @param convergenceTol The maximum change in log-likelihood at which convergence * is considered to have occurred. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 89b38679b7494..0e4a4d0085895 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -706,7 +706,7 @@ object Matrices { } /** - * Generate a `DenseMatrix` consisting of zeros. 
+ * Generate a `Matrix` consisting of zeros. * @param numRows number of rows of the matrix * @param numCols number of columns of the matrix * @return `Matrix` with size `numRows` x `numCols` and values of zeros @@ -778,8 +778,8 @@ object Matrices { SparseMatrix.sprandn(numRows, numCols, density, rng) /** - * Generate a diagonal matrix in `DenseMatrix` format from the supplied values. - * @param vector a `Vector` tat will form the values on the diagonal of the matrix + * Generate a diagonal matrix in `Matrix` format from the supplied values. + * @param vector a `Vector` that will form the values on the diagonal of the matrix * @return Square `Matrix` with size `values.length` x `values.length` and `values` * on the diagonal */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 480bbfb5fe94a..4bdcb283da09c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -247,7 +247,7 @@ object Vectors { } /** - * Creates a dense vector of all zeros. + * Creates a vector of all zeros. * * @param size vector size * @return a zero vector diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala index 0acdab797e8f3..8bfa0d2b64995 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala @@ -63,10 +63,12 @@ abstract class Gradient extends Serializable { * http://statweb.stanford.edu/~tibs/ElemStatLearn/ , Eq. (4.17) on page 119 gives the formula of * multinomial logistic regression model. A simple calculation shows that * + * {{{ * P(y=0|x, w) = 1 / (1 + \sum_i^{K-1} \exp(x w_i)) * P(y=1|x, w) = exp(x w_1) / (1 + \sum_i^{K-1} \exp(x w_i)) * ... * P(y=K-1|x, w) = exp(x w_{K-1}) / (1 + \sum_i^{K-1} \exp(x w_i)) + * }}} * * for K classes multiclass classification problem. * @@ -75,9 +77,11 @@ abstract class Gradient extends Serializable { * will be (K-1) * N. * * As a result, the loss of objective function for a single instance of data can be written as + * {{{ * l(w, x) = -log P(y|x, w) = -\alpha(y) log P(y=0|x, w) - (1-\alpha(y)) log P(y|x, w) * = log(1 + \sum_i^{K-1}\exp(x w_i)) - (1-\alpha(y)) x w_{y-1} * = log(1 + \sum_i^{K-1}\exp(margins_i)) - (1-\alpha(y)) margins_{y-1} + * }}} * * where \alpha(i) = 1 if i != 0, and * \alpha(i) = 0 if i == 0, @@ -86,14 +90,16 @@ abstract class Gradient extends Serializable { * For optimization, we have to calculate the first derivative of the loss function, and * a simple calculation shows that * + * {{{ * \frac{\partial l(w, x)}{\partial w_{ij}} * = (\exp(x w_i) / (1 + \sum_k^{K-1} \exp(x w_k)) - (1-\alpha(y)\delta_{y, i+1})) * x_j * = multiplier_i * x_j + * }}} * * where \delta_{i, j} = 1 if i == j, * \delta_{i, j} = 0 if i != j, and - * multiplier - * = \exp(margins_i) / (1 + \sum_k^{K-1} \exp(margins_i)) - (1-\alpha(y)\delta_{y, i+1}) + * multiplier = + * \exp(margins_i) / (1 + \sum_k^{K-1} \exp(margins_i)) - (1-\alpha(y)\delta_{y, i+1}) * * If any of margins is larger than 709.78, the numerical computation of multiplier and loss * function will be suffered from arithmetic overflow. 
This issue occurs when there are outliers @@ -103,10 +109,12 @@ abstract class Gradient extends Serializable { * Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can be * easily rewritten into the following equivalent numerically stable formula. * + * {{{ * l(w, x) = log(1 + \sum_i^{K-1}\exp(margins_i)) - (1-\alpha(y)) margins_{y-1} * = log(\exp(-maxMargin) + \sum_i^{K-1}\exp(margins_i - maxMargin)) + maxMargin * - (1-\alpha(y)) margins_{y-1} * = log(1 + sum) + maxMargin - (1-\alpha(y)) margins_{y-1} + * }}} * * where sum = \exp(-maxMargin) + \sum_i^{K-1}\exp(margins_i - maxMargin) - 1. * @@ -115,8 +123,10 @@ abstract class Gradient extends Serializable { * * For multiplier, similar trick can be applied as the following, * + * {{{ * multiplier = \exp(margins_i) / (1 + \sum_k^{K-1} \exp(margins_i)) - (1-\alpha(y)\delta_{y, i+1}) * = \exp(margins_i - maxMargin) / (1 + sum) - (1-\alpha(y)\delta_{y, i+1}) + * }}} * * where each term in \exp is also smaller than zero, so overflow is not a concern. * diff --git a/python/docs/pyspark.mllib.rst b/python/docs/pyspark.mllib.rst index 21f66ca344a3c..b706c5e376ef4 100644 --- a/python/docs/pyspark.mllib.rst +++ b/python/docs/pyspark.mllib.rst @@ -7,7 +7,7 @@ pyspark.mllib.classification module .. automodule:: pyspark.mllib.classification :members: :undoc-members: - :show-inheritance: + :inherited-members: pyspark.mllib.clustering module ------------------------------- @@ -15,7 +15,6 @@ pyspark.mllib.clustering module .. automodule:: pyspark.mllib.clustering :members: :undoc-members: - :show-inheritance: pyspark.mllib.feature module ------------------------------- @@ -39,7 +38,6 @@ pyspark.mllib.random module .. automodule:: pyspark.mllib.random :members: :undoc-members: - :show-inheritance: pyspark.mllib.recommendation module ----------------------------------- @@ -47,7 +45,6 @@ pyspark.mllib.recommendation module .. automodule:: pyspark.mllib.recommendation :members: :undoc-members: - :show-inheritance: pyspark.mllib.regression module ------------------------------- @@ -55,7 +52,7 @@ pyspark.mllib.regression module .. automodule:: pyspark.mllib.regression :members: :undoc-members: - :show-inheritance: + :inherited-members: pyspark.mllib.stat module ------------------------- @@ -63,7 +60,6 @@ pyspark.mllib.stat module .. automodule:: pyspark.mllib.stat :members: :undoc-members: - :show-inheritance: pyspark.mllib.tree module ------------------------- @@ -71,7 +67,7 @@ pyspark.mllib.tree module .. automodule:: pyspark.mllib.tree :members: :undoc-members: - :show-inheritance: + :inherited-members: pyspark.mllib.util module ------------------------- @@ -79,4 +75,3 @@ pyspark.mllib.util module .. automodule:: pyspark.mllib.util :members: :undoc-members: - :show-inheritance: diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index b6de7493d7523..4ff7463498cce 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -15,10 +15,11 @@ # limitations under the License. 
# -from pyspark.ml.util import inherit_doc, keyword_only +from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,\ HasRegParam +from pyspark.mllib.common import inherit_doc __all__ = ['LogisticRegression', 'LogisticRegressionModel'] diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index f1ddbb478dd9c..433b4fb5d22bf 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -16,8 +16,9 @@ # from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasNumFeatures -from pyspark.ml.util import inherit_doc, keyword_only +from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaTransformer +from pyspark.mllib.common import inherit_doc __all__ = ['Tokenizer', 'HashingTF'] diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 18d8a58f357bd..5233c5801e2e6 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -18,7 +18,8 @@ from abc import ABCMeta, abstractmethod from pyspark.ml.param import Param, Params -from pyspark.ml.util import inherit_doc, keyword_only +from pyspark.ml.util import keyword_only +from pyspark.mllib.common import inherit_doc __all__ = ['Estimator', 'Transformer', 'Pipeline', 'PipelineModel'] diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index 81d3f0882b8a9..6f7f39c40eb5a 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -19,20 +19,6 @@ import uuid -def inherit_doc(cls): - for name, func in vars(cls).items(): - # only inherit docstring for public functions - if name.startswith("_"): - continue - if not func.__doc__: - for parent in cls.__bases__: - parent_func = getattr(parent, name, None) - if parent_func and getattr(parent_func, "__doc__", None): - func.__doc__ = parent_func.__doc__ - break - return cls - - def keyword_only(func): """ A decorator that forces keyword arguments in the wrapped method diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 9e12ddc3d9b8f..4bae96f678388 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -21,7 +21,7 @@ from pyspark.sql import DataFrame from pyspark.ml.param import Params from pyspark.ml.pipeline import Estimator, Transformer -from pyspark.ml.util import inherit_doc +from pyspark.mllib.common import inherit_doc def _jvm(): diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py index c3217620e3c4e..6449800d9c120 100644 --- a/python/pyspark/mllib/__init__.py +++ b/python/pyspark/mllib/__init__.py @@ -19,7 +19,7 @@ Python bindings for MLlib. """ -# MLlib currently needs and NumPy 1.4+, so complain if lower +# MLlib currently needs NumPy 1.4+, so complain if lower import numpy if numpy.version.version < '1.4': diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index f6b97abb1723c..949db5705abd7 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -152,7 +152,7 @@ def predictSoft(self, x): class GaussianMixture(object): """ - Estimate model parameters with the expectation-maximization algorithm. + Learning algorithm for Gaussian Mixtures using the expectation-maximization algorithm. 
:param data: RDD of data points :param k: Number of components diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py index 3c5ee66cd8b64..621591c26b77f 100644 --- a/python/pyspark/mllib/common.py +++ b/python/pyspark/mllib/common.py @@ -134,3 +134,20 @@ def __del__(self): def call(self, name, *a): """Call method of java_model""" return callJavaFunc(self._sc, getattr(self._java_model, name), *a) + + +def inherit_doc(cls): + """ + A decorator that makes a class inherit documentation from its parents. + """ + for name, func in vars(cls).items(): + # only inherit docstring for public functions + if name.startswith("_"): + continue + if not func.__doc__: + for parent in cls.__bases__: + parent_func = getattr(parent, name, None) + if parent_func and getattr(parent_func, "__doc__", None): + func.__doc__ = parent_func.__doc__ + break + return cls diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 210060140fd91..21751cc68f308 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -18,7 +18,7 @@ import numpy as np from numpy import array -from pyspark.mllib.common import callMLlibFunc +from pyspark.mllib.common import callMLlibFunc, inherit_doc from pyspark.mllib.linalg import SparseVector, _convert_to_vector __all__ = ['LabeledPoint', 'LinearModel', 'LinearRegressionModel', 'RidgeRegressionModel', @@ -69,6 +69,7 @@ def __repr__(self): return "(weights=%s, intercept=%r)" % (self._coeff, self._intercept) +@inherit_doc class LinearRegressionModelBase(LinearModel): """A linear regression model. @@ -89,6 +90,7 @@ def predict(self, x): return self.weights.dot(x) + self.intercept +@inherit_doc class LinearRegressionModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit. @@ -162,7 +164,7 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, @param intercept: Boolean parameter which indicates the use or not of the augmented representation for training data (i.e. whether bias features - are activated or not). + are activated or not). 
(default: False) """ def train(rdd, i): return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations), @@ -172,6 +174,7 @@ def train(rdd, i): return _regression_train_wrapper(train, LinearRegressionModel, data, initialWeights) +@inherit_doc class LassoModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an @@ -218,6 +221,7 @@ def train(rdd, i): return _regression_train_wrapper(train, LassoModel, data, initialWeights) +@inherit_doc class RidgeRegressionModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py index b686d955a0080..e3e128513e0d7 100644 --- a/python/pyspark/mllib/stat/__init__.py +++ b/python/pyspark/mllib/stat/__init__.py @@ -21,5 +21,7 @@ from pyspark.mllib.stat._statistics import * from pyspark.mllib.stat.distribution import MultivariateGaussian +from pyspark.mllib.stat.test import ChiSqTestResult -__all__ = ["Statistics", "MultivariateStatisticalSummary", "MultivariateGaussian"] +__all__ = ["Statistics", "MultivariateStatisticalSummary", "ChiSqTestResult", + "MultivariateGaussian"] diff --git a/python/pyspark/mllib/stat/distribution.py b/python/pyspark/mllib/stat/distribution.py index 07792e1532046..46f7a1d2f277a 100644 --- a/python/pyspark/mllib/stat/distribution.py +++ b/python/pyspark/mllib/stat/distribution.py @@ -22,7 +22,8 @@ class MultivariateGaussian(namedtuple('MultivariateGaussian', ['mu', 'sigma'])): - """ Represents a (mu, sigma) tuple + """Represents a (mu, sigma) tuple + >>> m = MultivariateGaussian(Vectors.dense([11,12]),DenseMatrix(2, 2, (1.0, 3.0, 5.0, 2.0))) >>> (m.mu, m.sigma.toArray()) (DenseVector([11.0, 12.0]), array([[ 1., 5.],[ 3., 2.]])) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index aae48f213246b..02d551b87dcc0 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -20,12 +20,12 @@ import random from pyspark import SparkContext, RDD -from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper +from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper from pyspark.mllib.linalg import _convert_to_vector from pyspark.mllib.regression import LabeledPoint __all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel', - 'RandomForest', 'GradientBoostedTrees'] + 'RandomForest', 'GradientBoostedTreesModel', 'GradientBoostedTrees'] class TreeEnsembleModel(JavaModelWrapper): @@ -216,6 +216,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) +@inherit_doc class RandomForestModel(TreeEnsembleModel): """ .. note:: Experimental @@ -381,6 +382,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt featureSubsetStrategy, impurity, maxDepth, maxBins, seed) +@inherit_doc class GradientBoostedTreesModel(TreeEnsembleModel): """ .. note:: Experimental From 5b0a42cb17b840c82d3f8a5ad061d99e261ceadf Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 20 Feb 2015 15:35:05 -0800 Subject: [PATCH 264/357] [SPARK-5898] [SPARK-5896] [SQL] [PySpark] create DataFrame from pandas and tuple/list Fix createDataFrame() from pandas DataFrame (not tested by jenkins, depends on SPARK-5693). It also support to create DataFrame from plain tuple/list without column names, `_1`, `_2` will be used as column names. 
Author: Davies Liu Closes #4679 from davies/pandas and squashes the following commits: c0cbe0b [Davies Liu] fix tests 8466d1d [Davies Liu] fix create DataFrame from pandas --- python/pyspark/sql/context.py | 12 ++++++++++-- python/pyspark/sql/tests.py | 2 +- python/pyspark/sql/types.py | 26 +++++++++----------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 3f168f718bd7f..313f15e6d9b6f 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -351,6 +351,8 @@ def createDataFrame(self, data, schema=None, samplingRatio=None): :return: a DataFrame >>> l = [('Alice', 1)] + >>> sqlCtx.createDataFrame(l).collect() + [Row(_1=u'Alice', _2=1)] >>> sqlCtx.createDataFrame(l, ['name', 'age']).collect() [Row(name=u'Alice', age=1)] @@ -359,6 +361,8 @@ def createDataFrame(self, data, schema=None, samplingRatio=None): [Row(age=1, name=u'Alice')] >>> rdd = sc.parallelize(l) + >>> sqlCtx.createDataFrame(rdd).collect() + [Row(_1=u'Alice', _2=1)] >>> df = sqlCtx.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] @@ -377,14 +381,17 @@ def createDataFrame(self, data, schema=None, samplingRatio=None): >>> df3 = sqlCtx.createDataFrame(rdd, schema) >>> df3.collect() [Row(name=u'Alice', age=1)] + + >>> sqlCtx.createDataFrame(df.toPandas()).collect() # doctest: +SKIP + [Row(name=u'Alice', age=1)] """ if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if has_pandas and isinstance(data, pandas.DataFrame): - data = self._sc.parallelize(data.to_records(index=False)) if schema is None: schema = list(data.columns) + data = [r.tolist() for r in data.to_records(index=False)] if not isinstance(data, RDD): try: @@ -399,7 +406,8 @@ def createDataFrame(self, data, schema=None, samplingRatio=None): if isinstance(schema, (list, tuple)): first = data.first() if not isinstance(first, (list, tuple)): - raise ValueError("each row in `rdd` should be list or tuple") + raise ValueError("each row in `rdd` should be list or tuple, " + "but got %r" % type(first)) row_cls = Row(*schema) schema = self._inferSchema(data.map(lambda r: row_cls(*r)), samplingRatio) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 8e1bb36598727..39071e7e35ca1 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -186,7 +186,7 @@ def test_serialize_nested_array_and_map(self): self.assertEqual("2", row.d) def test_infer_schema(self): - d = [Row(l=[], d={}), + d = [Row(l=[], d={}, s=None), Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")}, s="")] rdd = self.sc.parallelize(d) df = self.sqlCtx.createDataFrame(rdd) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 9409c6f9f6556..b6e41cf0b29ff 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -604,7 +604,7 @@ def _infer_type(obj): ExamplePointUDT """ if obj is None: - raise ValueError("Can not infer type for None") + return NullType() if hasattr(obj, '__UDT__'): return obj.__UDT__ @@ -637,15 +637,14 @@ def _infer_schema(row): if isinstance(row, dict): items = sorted(row.items()) - elif isinstance(row, tuple): + elif isinstance(row, (tuple, list)): if hasattr(row, "_fields"): # namedtuple items = zip(row._fields, tuple(row)) elif hasattr(row, "__FIELDS__"): # Row items = zip(row.__FIELDS__, tuple(row)) - elif all(isinstance(x, tuple) and len(x) == 2 for x in row): - items = row else: - raise ValueError("Can't infer schema from tuple") + 
names = ['_%d' % i for i in range(1, len(row) + 1)] + items = zip(names, row) elif hasattr(row, "__dict__"): # object items = sorted(row.__dict__.items()) @@ -812,17 +811,10 @@ def convert_struct(obj): if obj is None: return - if isinstance(obj, tuple): - if hasattr(obj, "_fields"): - d = dict(zip(obj._fields, obj)) - elif hasattr(obj, "__FIELDS__"): - d = dict(zip(obj.__FIELDS__, obj)) - elif all(isinstance(x, tuple) and len(x) == 2 for x in obj): - d = dict(obj) - else: - raise ValueError("unexpected tuple: %s" % str(obj)) + if isinstance(obj, (tuple, list)): + return tuple(conv(v) for v, conv in zip(obj, converters)) - elif isinstance(obj, dict): + if isinstance(obj, dict): d = obj elif hasattr(obj, "__dict__"): # object d = obj.__dict__ @@ -1022,7 +1014,7 @@ def _verify_type(obj, dataType): return _type = type(dataType) - assert _type in _acceptable_types, "unkown datatype: %s" % dataType + assert _type in _acceptable_types, "unknown datatype: %s" % dataType # subclass of them can not be deserialized in JVM if type(obj) not in _acceptable_types[_type]: @@ -1040,7 +1032,7 @@ def _verify_type(obj, dataType): elif isinstance(dataType, StructType): if len(obj) != len(dataType.fields): - raise ValueError("Length of object (%d) does not match with" + raise ValueError("Length of object (%d) does not match with " "length of fields (%d)" % (len(obj), len(dataType.fields))) for v, f in zip(obj, dataType.fields): _verify_type(v, f.dataType) From e155324711740da97698b93526128b0eae2dc0ce Mon Sep 17 00:00:00 2001 From: Jacky Li Date: Sat, 21 Feb 2015 13:00:16 +0000 Subject: [PATCH 265/357] [MLlib] fix typo fix typo: it should be "default:" instead of "default;" Author: Jacky Li Closes #4713 from jackylk/patch-10 and squashes the following commits: 15daf2e [Jacky Li] [MLlib] fix typo --- .../scala/org/apache/spark/examples/mllib/DenseKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseKMeans.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseKMeans.scala index 11e35598baf50..14cc5cbb679c5 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseKMeans.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseKMeans.scala @@ -56,7 +56,7 @@ object DenseKMeans { .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") - .text(s"number of iterations, default; ${defaultParams.numIterations}") + .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + From d3cbd38c33e6a2addcf8caa18eeb10036fbfd01b Mon Sep 17 00:00:00 2001 From: Nishkam Ravi Date: Sat, 21 Feb 2015 09:59:28 -0800 Subject: [PATCH 266/357] SPARK-5841 [CORE] [HOTFIX 2] Memory leak in DiskBlockManager Continue to see IllegalStateException in YARN cluster mode. Adding a simple workaround for now. Author: Nishkam Ravi Author: nishkamravi2 Author: nravi Closes #4690 from nishkamravi2/master_nravi and squashes the following commits: d453197 [nishkamravi2] Update NewHadoopRDD.scala 6f41a1d [nishkamravi2] Update NewHadoopRDD.scala 0ce2c32 [nishkamravi2] Update HadoopRDD.scala f7e33c2 [Nishkam Ravi] Merge branch 'master_nravi' of https://github.com/nishkamravi2/spark into master_nravi ba1eb8b [Nishkam Ravi] Try-catch block around the two occurrences of removeShutDownHook. 
Deletion of semi-redundant occurrences of expensive operation inShutDown. 71d0e17 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi 494d8c0 [nishkamravi2] Update DiskBlockManager.scala 3c5ddba [nishkamravi2] Update DiskBlockManager.scala f0d12de [Nishkam Ravi] Workaround for IllegalStateException caused by recent changes to BlockManager.stop 79ea8b4 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi b446edc [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi 5c9a4cb [nishkamravi2] Update TaskSetManagerSuite.scala 535295a [nishkamravi2] Update TaskSetManager.scala 3e1b616 [Nishkam Ravi] Modify test for maxResultSize 9f6583e [Nishkam Ravi] Changes to maxResultSize code (improve error message and add condition to check if maxResultSize > 0) 5f8f9ed [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi 636a9ff [nishkamravi2] Update YarnAllocator.scala 8f76c8b [Nishkam Ravi] Doc change for yarn memory overhead 35daa64 [Nishkam Ravi] Slight change in the doc for yarn memory overhead 5ac2ec1 [Nishkam Ravi] Remove out dac1047 [Nishkam Ravi] Additional documentation for yarn memory overhead issue 42c2c3d [Nishkam Ravi] Additional changes for yarn memory overhead issue 362da5e [Nishkam Ravi] Additional changes for yarn memory overhead c726bd9 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi f00fa31 [Nishkam Ravi] Improving logging for AM memoryOverhead 1cf2d1e [nishkamravi2] Update YarnAllocator.scala ebcde10 [Nishkam Ravi] Modify default YARN memory_overhead-- from an additive constant to a multiplier (redone to resolve merge conflicts) 2e69f11 [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark into master_nravi efd688a [Nishkam Ravi] Merge branch 'master' of https://github.com/apache/spark 2b630f9 [nravi] Accept memory input as "30g", "512M" instead of an int value, to be consistent with rest of Spark 3bf8fad [nravi] Merge branch 'master' of https://github.com/apache/spark 5423a03 [nravi] Merge branch 'master' of https://github.com/apache/spark eb663ca [nravi] Merge branch 'master' of https://github.com/apache/spark df2aeb1 [nravi] Improved fix for ConcurrentModificationIssue (Spark-1097, Hadoop-10456) 6b840f0 [nravi] Undo the fix for SPARK-1758 (the problem is fixed) 5108700 [nravi] Fix in Spark for the Concurrent thread modification issue (SPARK-1097, HADOOP-10456) 681b36f [nravi] Fix for SPARK-1758: failing test org.apache.spark.JavaAPISuite.wholeTextFiles --- .../org/apache/spark/deploy/worker/ExecutorRunner.scala | 6 +++++- .../scala/org/apache/spark/storage/DiskBlockManager.scala | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala index 0add3064da452..bea04cd542fd1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala @@ -105,7 +105,11 @@ private[spark] class ExecutorRunner( workerThread.interrupt() workerThread = null state = ExecutorState.KILLED - Runtime.getRuntime.removeShutdownHook(shutdownHook) + try { + Runtime.getRuntime.removeShutdownHook(shutdownHook) + } catch { + case e: IllegalStateException => None + } } } diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala 
b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index b297f3fd9dd1e..12cd8ea3bdf1f 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -148,7 +148,11 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkCon /** Cleanup local dirs and stop shuffle sender. */ private[spark] def stop() { // Remove the shutdown hook. It causes memory leaks if we leave it around. - Runtime.getRuntime.removeShutdownHook(shutdownHook) + try { + Runtime.getRuntime.removeShutdownHook(shutdownHook) + } catch { + case e: IllegalStateException => None + } doStop() } From 7138816abe1060a1e967c4c77c72d5752586d557 Mon Sep 17 00:00:00 2001 From: Hari Shreedharan Date: Sat, 21 Feb 2015 10:01:01 -0800 Subject: [PATCH 267/357] [SPARK-5937][YARN] Fix ClientSuite to set YARN mode, so that the correct class is used in t... ...ests. Without this SparkHadoopUtil is used by the Client instead of YarnSparkHadoopUtil. Author: Hari Shreedharan Closes #4711 from harishreedharan/SPARK-5937 and squashes the following commits: d154de6 [Hari Shreedharan] Use System.clearProperty() instead of setting the value of SPARK_YARN_MODE to empty string. f729f70 [Hari Shreedharan] Fix ClientSuite to set YARN mode, so that the correct class is used in tests. --- .../org/apache/spark/deploy/yarn/ClientSuite.scala | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index f8f8129d220e4..92f04b4b859b3 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -28,8 +28,7 @@ import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.mockito.Matchers._ import org.mockito.Mockito._ -import org.scalatest.FunSuite -import org.scalatest.Matchers +import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} import scala.collection.JavaConversions._ import scala.collection.mutable.{ HashMap => MutableHashMap } @@ -39,7 +38,15 @@ import scala.util.Try import org.apache.spark.{SparkException, SparkConf} import org.apache.spark.util.Utils -class ClientSuite extends FunSuite with Matchers { +class ClientSuite extends FunSuite with Matchers with BeforeAndAfterAll { + + override def beforeAll(): Unit = { + System.setProperty("SPARK_YARN_MODE", "true") + } + + override def afterAll(): Unit = { + System.clearProperty("SPARK_YARN_MODE") + } test("default Yarn application classpath") { Client.getDefaultYarnApplicationClasspath should be(Some(Fixtures.knownDefYarnAppCP)) From 7683982faf920b8ac6cf46b79842450e7d46c5cc Mon Sep 17 00:00:00 2001 From: Evan Yu Date: Sat, 21 Feb 2015 20:40:21 +0000 Subject: [PATCH 268/357] [SPARK-5860][CORE] JdbcRDD: overflow on large range with high number of partitions Fix a overflow bug in JdbcRDD when calculating partitions for large BIGINT ids Author: Evan Yu Closes #4701 from hotou/SPARK-5860 and squashes the following commits: 9e038d1 [Evan Yu] [SPARK-5860][CORE] Prevent overflowing at the length level 7883ad9 [Evan Yu] [SPARK-5860][CORE] Prevent overflowing at the length level c88755a [Evan Yu] [SPARK-5860][CORE] switch to BigInt instead of BigDecimal 4e9ff4f [Evan Yu] [SPARK-5860][CORE] JdbcRDD overflow on large range with high number of partitions --- 
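A brief illustrative aside, not part of the patch itself: the wrap-around this change guards against can be reproduced with the bounds used in the new "large id overflow" test further down, because `i * length` exceeds `Long.MaxValue` once the id range is large enough. A minimal sketch, assuming only the Scala standard library:

```scala
// Roughly 5.7e17 ids split across 20 partitions, as in the new test below.
val lowerBound = 1131544775L
val upperBound = 567279358897692673L
val numPartitions = 20

// Old arithmetic: everything stays in Long, so the per-partition offsets overflow.
val longLength = 1 + upperBound - lowerBound
println(19 * longLength)                         // wraps to a negative value: silent Long overflow

// New arithmetic: promote to BigInt first, convert back only after dividing.
val length = BigInt(1) + upperBound - lowerBound
val start = lowerBound + ((19 * length) / numPartitions)
println(start.toLong)                            // a valid partition start offset
```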
.../scala/org/apache/spark/rdd/JdbcRDD.scala | 8 +-- .../org/apache/spark/rdd/JdbcRDDSuite.scala | 60 ++++++++++++++----- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala index 4fe7622bda00f..e2267861e79df 100644 --- a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala @@ -62,11 +62,11 @@ class JdbcRDD[T: ClassTag]( override def getPartitions: Array[Partition] = { // bounds are inclusive, hence the + 1 here and - 1 on end - val length = 1 + upperBound - lowerBound + val length = BigInt(1) + upperBound - lowerBound (0 until numPartitions).map(i => { - val start = lowerBound + ((i * length) / numPartitions).toLong - val end = lowerBound + (((i + 1) * length) / numPartitions).toLong - 1 - new JdbcPartition(i, start, end) + val start = lowerBound + ((i * length) / numPartitions) + val end = lowerBound + (((i + 1) * length) / numPartitions) - 1 + new JdbcPartition(i, start.toLong, end.toLong) }).toArray } diff --git a/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala index 6138d0bbd57f6..0dc59888f7304 100644 --- a/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/JdbcRDDSuite.scala @@ -29,22 +29,42 @@ class JdbcRDDSuite extends FunSuite with BeforeAndAfter with LocalSparkContext { Class.forName("org.apache.derby.jdbc.EmbeddedDriver") val conn = DriverManager.getConnection("jdbc:derby:target/JdbcRDDSuiteDb;create=true") try { - val create = conn.createStatement - create.execute(""" - CREATE TABLE FOO( - ID INTEGER NOT NULL GENERATED ALWAYS AS IDENTITY (START WITH 1, INCREMENT BY 1), - DATA INTEGER - )""") - create.close() - val insert = conn.prepareStatement("INSERT INTO FOO(DATA) VALUES(?)") - (1 to 100).foreach { i => - insert.setInt(1, i * 2) - insert.executeUpdate + + try { + val create = conn.createStatement + create.execute(""" + CREATE TABLE FOO( + ID INTEGER NOT NULL GENERATED ALWAYS AS IDENTITY (START WITH 1, INCREMENT BY 1), + DATA INTEGER + )""") + create.close() + val insert = conn.prepareStatement("INSERT INTO FOO(DATA) VALUES(?)") + (1 to 100).foreach { i => + insert.setInt(1, i * 2) + insert.executeUpdate + } + insert.close() + } catch { + case e: SQLException if e.getSQLState == "X0Y32" => + // table exists } - insert.close() - } catch { - case e: SQLException if e.getSQLState == "X0Y32" => + + try { + val create = conn.createStatement + create.execute("CREATE TABLE BIGINT_TEST(ID BIGINT NOT NULL, DATA INTEGER)") + create.close() + val insert = conn.prepareStatement("INSERT INTO BIGINT_TEST VALUES(?,?)") + (1 to 100).foreach { i => + insert.setLong(1, 100000000000000000L + 4000000000000000L * i) + insert.setInt(2, i) + insert.executeUpdate + } + insert.close() + } catch { + case e: SQLException if e.getSQLState == "X0Y32" => // table exists + } + } finally { conn.close() } @@ -62,6 +82,18 @@ class JdbcRDDSuite extends FunSuite with BeforeAndAfter with LocalSparkContext { assert(rdd.count === 100) assert(rdd.reduce(_+_) === 10100) } + + test("large id overflow") { + sc = new SparkContext("local", "test") + val rdd = new JdbcRDD( + sc, + () => { DriverManager.getConnection("jdbc:derby:target/JdbcRDDSuiteDb") }, + "SELECT DATA FROM BIGINT_TEST WHERE ? 
<= ID AND ID <= ?", + 1131544775L, 567279358897692673L, 20, + (r: ResultSet) => { r.getInt(1) } ).cache() + assert(rdd.count === 100) + assert(rdd.reduce(_+_) === 5050) + } after { try { From 46462ff255b0eef8263ed798f3d5aeb8460ecaf1 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Sat, 21 Feb 2015 23:07:30 -0800 Subject: [PATCH 269/357] MAINTENANCE: Automated closing of pull requests. This commit exists to close the following pull requests on Github: Closes #3490 (close requested by 'andrewor14') Closes #4646 (close requested by 'srowen') Closes #3591 (close requested by 'andrewor14') Closes #3656 (close requested by 'andrewor14') Closes #4553 (close requested by 'JoshRosen') Closes #4202 (close requested by 'srowen') Closes #4497 (close requested by 'marmbrus') Closes #4150 (close requested by 'andrewor14') Closes #2409 (close requested by 'andrewor14') Closes #4221 (close requested by 'srowen') From a7f90390251ff62a0e10edf4c2eb876538597791 Mon Sep 17 00:00:00 2001 From: Alexander Date: Sun, 22 Feb 2015 08:53:05 +0000 Subject: [PATCH 270/357] =?UTF-8?q?[DOCS]=20Fix=20typo=20in=20API=20for=20?= =?UTF-8?q?custom=20InputFormats=20based=20on=20the=20=E2=80=9Cnew?= =?UTF-8?q?=E2=80=9D=20MapReduce=20API?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This looks like a simple typo ```SparkContext.newHadoopRDD``` instead of ```SparkContext.newAPIHadoopRDD``` as in actual http://spark.apache.org/docs/1.2.1/api/scala/index.html#org.apache.spark.SparkContext Author: Alexander Closes #4718 from bzz/hadoop-InputFormats-doc-fix and squashes the following commits: 680a4c4 [Alexander] Fix typo in docs on custom Hadoop InputFormats --- docs/programming-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/programming-guide.md b/docs/programming-guide.md index 4e4af76316863..7b0701828878e 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -335,7 +335,7 @@ Apart from text files, Spark's Scala API also supports several other data format * For [SequenceFiles](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/SequenceFileInputFormat.html), use SparkContext's `sequenceFile[K, V]` method where `K` and `V` are the types of key and values in the file. These should be subclasses of Hadoop's [Writable](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/io/Writable.html) interface, like [IntWritable](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/io/IntWritable.html) and [Text](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/io/Text.html). In addition, Spark allows you to specify native types for a few common Writables; for example, `sequenceFile[Int, String]` will automatically read IntWritables and Texts. -* For other Hadoop InputFormats, you can use the `SparkContext.hadoopRDD` method, which takes an arbitrary `JobConf` and input format class, key class and value class. Set these the same way you would for a Hadoop job with your input source. You can also use `SparkContext.newHadoopRDD` for InputFormats based on the "new" MapReduce API (`org.apache.hadoop.mapreduce`). +* For other Hadoop InputFormats, you can use the `SparkContext.hadoopRDD` method, which takes an arbitrary `JobConf` and input format class, key class and value class. Set these the same way you would for a Hadoop job with your input source. You can also use `SparkContext.newAPIHadoopRDD` for InputFormats based on the "new" MapReduce API (`org.apache.hadoop.mapreduce`). 
* `RDD.saveAsObjectFile` and `SparkContext.objectFile` support saving an RDD in a simple format consisting of serialized Java objects. While this is not as efficient as specialized formats like Avro, it offers an easy way to save any RDD. @@ -367,7 +367,7 @@ Apart from text files, Spark's Java API also supports several other data formats * For [SequenceFiles](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/SequenceFileInputFormat.html), use SparkContext's `sequenceFile[K, V]` method where `K` and `V` are the types of key and values in the file. These should be subclasses of Hadoop's [Writable](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/io/Writable.html) interface, like [IntWritable](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/io/IntWritable.html) and [Text](http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/io/Text.html). -* For other Hadoop InputFormats, you can use the `JavaSparkContext.hadoopRDD` method, which takes an arbitrary `JobConf` and input format class, key class and value class. Set these the same way you would for a Hadoop job with your input source. You can also use `JavaSparkContext.newHadoopRDD` for InputFormats based on the "new" MapReduce API (`org.apache.hadoop.mapreduce`). +* For other Hadoop InputFormats, you can use the `JavaSparkContext.hadoopRDD` method, which takes an arbitrary `JobConf` and input format class, key class and value class. Set these the same way you would for a Hadoop job with your input source. You can also use `JavaSparkContext.newAPIHadoopRDD` for InputFormats based on the "new" MapReduce API (`org.apache.hadoop.mapreduce`). * `JavaRDD.saveAsObjectFile` and `JavaSparkContext.objectFile` support saving an RDD in a simple format consisting of serialized Java objects. While this is not as efficient as specialized formats like Avro, it offers an easy way to save any RDD. From 275b1bef897d775f1f7743378ca3e09e36160136 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Sun, 22 Feb 2015 08:56:30 +0000 Subject: [PATCH 271/357] [DataFrame] [Typo] Fix the typo Author: Cheng Hao Closes #4717 from chenghao-intel/typo1 and squashes the following commits: 858d7b0 [Cheng Hao] update the typo --- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 810f7c77477bb..69e5f6a07da7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -138,7 +138,7 @@ class DataFrame protected[sql]( /** * An implicit conversion function internal to this class for us to avoid doing - * "new DataFrameImpl(...)" everywhere. + * "new DataFrame(...)" everywhere. */ @inline private implicit def logicalPlanToDataFrame(logicalPlan: LogicalPlan): DataFrame = { new DataFrame(sqlContext, logicalPlan) From e4f9d03d728bc6fbfb6ebc7d15b4ba328f98f3dc Mon Sep 17 00:00:00 2001 From: Aaron Josephs Date: Sun, 22 Feb 2015 22:09:06 -0800 Subject: [PATCH 272/357] [SPARK-911] allow efficient queries for a range if RDD is partitioned wi... 
...th RangePartitioner Author: Aaron Josephs Closes #1381 from aaronjosephs/PLAT-911 and squashes the following commits: e30ade5 [Aaron Josephs] [SPARK-911] allow efficient queries for a range if RDD is partitioned with RangePartitioner --- .../spark/rdd/OrderedRDDFunctions.scala | 23 +++++++++++++++ .../org/apache/spark/rdd/SortingSuite.scala | 28 +++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala index 144f679a59460..6fdfdb734d1b8 100644 --- a/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala @@ -75,4 +75,27 @@ class OrderedRDDFunctions[K : Ordering : ClassTag, new ShuffledRDD[K, V, V](self, partitioner).setKeyOrdering(ordering) } + /** + * Returns an RDD containing only the elements in the the inclusive range `lower` to `upper`. + * If the RDD has been partitioned using a `RangePartitioner`, then this operation can be + * performed efficiently by only scanning the partitions that might contain matching elements. + * Otherwise, a standard `filter` is applied to all partitions. + */ + def filterByRange(lower: K, upper: K): RDD[P] = { + + def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) + + val rddToFilter: RDD[P] = self.partitioner match { + case Some(rp: RangePartitioner[K, V]) => { + val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { + case (l, u) => Math.min(l, u) to Math.max(l, u) + } + PartitionPruningRDD.create(self, partitionIndicies.contains) + } + case _ => + self + } + rddToFilter.filter { case (k, v) => inRange(k) } + } + } diff --git a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala index a40f2ffeffdf9..64b1c24c47168 100644 --- a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala @@ -119,5 +119,33 @@ class SortingSuite extends FunSuite with SharedSparkContext with Matchers with L partitions(1).last should be > partitions(2).head partitions(2).last should be > partitions(3).head } + + test("get a range of elements in a sorted RDD that is on one partition") { + val pairArr = (1 to 1000).map(x => (x, x)).toArray + val sorted = sc.parallelize(pairArr, 10).sortByKey() + val range = sorted.filterByRange(20, 40).collect() + assert((20 to 40).toArray === range.map(_._1)) + } + + test("get a range of elements over multiple partitions in a descendingly sorted RDD") { + val pairArr = (1000 to 1 by -1).map(x => (x, x)).toArray + val sorted = sc.parallelize(pairArr, 10).sortByKey(false) + val range = sorted.filterByRange(200, 800).collect() + assert((800 to 200 by -1).toArray === range.map(_._1)) + } + + test("get a range of elements in an array not partitioned by a range partitioner") { + val pairArr = util.Random.shuffle((1 to 1000).toList).map(x => (x, x)) + val pairs = sc.parallelize(pairArr,10) + val range = pairs.filterByRange(200, 800).collect() + assert((800 to 200 by -1).toArray.sorted === range.map(_._1).sorted) + } + + test("get a range of elements over multiple partitions but not taking up full partitions") { + val pairArr = (1000 to 1 by -1).map(x => (x, x)).toArray + val sorted = sc.parallelize(pairArr, 10).sortByKey(false) + val range = sorted.filterByRange(250, 850).collect() + assert((850 to 250 by -1).toArray === 
range.map(_._1)) + } } From 95cd643aa954b7e4229e94fa8bdc99bf3b2bb1da Mon Sep 17 00:00:00 2001 From: Ilya Ganelin Date: Sun, 22 Feb 2015 22:43:04 -0800 Subject: [PATCH 273/357] [SPARK-3885] Provide mechanism to remove accumulators once they are no longer used Instead of storing a strong reference to accumulators, I've replaced this with a weak reference and updated any code that uses these accumulators to check whether the reference resolves before using the accumulator. A weak reference will be cleared when there is no longer an existing copy of the variable versus using a soft reference in which case accumulators would only be cleared when the GC explicitly ran out of memory. Author: Ilya Ganelin Closes #4021 from ilganeli/SPARK-3885 and squashes the following commits: 4ba9575 [Ilya Ganelin] Fixed error in test suite 8510943 [Ilya Ganelin] Extra code bb76ef0 [Ilya Ganelin] File deleted somehow 283a333 [Ilya Ganelin] Added cleanup method for accumulators to remove stale references within Accumulators.original to accumulators that are now out of scope 345fd4f [Ilya Ganelin] Merge remote-tracking branch 'upstream/master' into SPARK-3885 7485a82 [Ilya Ganelin] Fixed build error c8e0f2b [Ilya Ganelin] Added working test for accumulator garbage collection 94ce754 [Ilya Ganelin] Still not being properly garbage collected 8722b63 [Ilya Ganelin] Fixing gc test 7414a9c [Ilya Ganelin] Added test for accumulator garbage collection 18d62ec [Ilya Ganelin] Updated to throw Exception when accessing a GCd accumulator 9a81928 [Ilya Ganelin] Reverting permissions changes 28f705c [Ilya Ganelin] Merge remote-tracking branch 'upstream/master' into SPARK-3885 b820ab4b [Ilya Ganelin] reset d78f4bf [Ilya Ganelin] Removed obsolete comment 0746e61 [Ilya Ganelin] Updated DAGSchedulerSUite to fix bug 3350852 [Ilya Ganelin] Updated DAGScheduler and Suite to correctly use new implementation of WeakRef Accumulator storage c49066a [Ilya Ganelin] Merge remote-tracking branch 'upstream/master' into SPARK-3885 cbb9023 [Ilya Ganelin] Merge remote-tracking branch 'upstream/master' into SPARK-3885 a77d11b [Ilya Ganelin] Updated Accumulators class to store weak references instead of strong references to allow garbage collection of old accumulators --- .../scala/org/apache/spark/Accumulators.scala | 36 ++++++++++++++----- .../org/apache/spark/ContextCleaner.scala | 20 +++++++++++ .../scala/org/apache/spark/SparkContext.scala | 28 +++++++++++---- .../apache/spark/scheduler/DAGScheduler.scala | 10 +++++- .../org/apache/spark/AccumulatorSuite.scala | 20 +++++++++++ .../apache/spark/ContextCleanerSuite.scala | 4 +++ .../spark/scheduler/DAGSchedulerSuite.scala | 6 +++- 7 files changed, 107 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala index 5f31bfba3f8d6..30f0ccd73ccca 100644 --- a/core/src/main/scala/org/apache/spark/Accumulators.scala +++ b/core/src/main/scala/org/apache/spark/Accumulators.scala @@ -23,6 +23,7 @@ import java.lang.ThreadLocal import scala.collection.generic.Growable import scala.collection.mutable.Map +import scala.ref.WeakReference import scala.reflect.ClassTag import org.apache.spark.serializer.JavaSerializer @@ -280,10 +281,12 @@ object AccumulatorParam { // TODO: The multi-thread support in accumulators is kind of lame; check // if there's a more intuitive way of doing it right private[spark] object Accumulators { - // TODO: Use soft references? 
=> need to make readObject work properly then - val originals = Map[Long, Accumulable[_, _]]() - val localAccums = new ThreadLocal[Map[Long, Accumulable[_, _]]]() { - override protected def initialValue() = Map[Long, Accumulable[_, _]]() + // Store a WeakReference instead of a StrongReference because this way accumulators can be + // appropriately garbage collected during long-running jobs and release memory + type WeakAcc = WeakReference[Accumulable[_, _]] + val originals = Map[Long, WeakAcc]() + val localAccums = new ThreadLocal[Map[Long, WeakAcc]]() { + override protected def initialValue() = Map[Long, WeakAcc]() } var lastId: Long = 0 @@ -294,9 +297,9 @@ private[spark] object Accumulators { def register(a: Accumulable[_, _], original: Boolean): Unit = synchronized { if (original) { - originals(a.id) = a + originals(a.id) = new WeakAcc(a) } else { - localAccums.get()(a.id) = a + localAccums.get()(a.id) = new WeakAcc(a) } } @@ -307,11 +310,22 @@ private[spark] object Accumulators { } } + def remove(accId: Long) { + synchronized { + originals.remove(accId) + } + } + // Get the values of the local accumulators for the current thread (by ID) def values: Map[Long, Any] = synchronized { val ret = Map[Long, Any]() for ((id, accum) <- localAccums.get) { - ret(id) = accum.localValue + // Since we are now storing weak references, we must check whether the underlying data + // is valid. + ret(id) = accum.get match { + case Some(values) => values.localValue + case None => None + } } return ret } @@ -320,7 +334,13 @@ private[spark] object Accumulators { def add(values: Map[Long, Any]): Unit = synchronized { for ((id, value) <- values) { if (originals.contains(id)) { - originals(id).asInstanceOf[Accumulable[Any, Any]] ++= value + // Since we are now storing weak references, we must check whether the underlying data + // is valid. + originals(id).get match { + case Some(accum) => accum.asInstanceOf[Accumulable[Any, Any]] ++= value + case None => + throw new IllegalAccessError("Attempted to access garbage collected Accumulator.") + } } } } diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala index ede1e23f4fcc5..434f1e47cf822 100644 --- a/core/src/main/scala/org/apache/spark/ContextCleaner.scala +++ b/core/src/main/scala/org/apache/spark/ContextCleaner.scala @@ -32,6 +32,7 @@ private sealed trait CleanupTask private case class CleanRDD(rddId: Int) extends CleanupTask private case class CleanShuffle(shuffleId: Int) extends CleanupTask private case class CleanBroadcast(broadcastId: Long) extends CleanupTask +private case class CleanAccum(accId: Long) extends CleanupTask /** * A WeakReference associated with a CleanupTask. @@ -114,6 +115,10 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { registerForCleanup(rdd, CleanRDD(rdd.id)) } + def registerAccumulatorForCleanup(a: Accumulable[_, _]): Unit = { + registerForCleanup(a, CleanAccum(a.id)) + } + /** Register a ShuffleDependency for cleanup when it is garbage collected. 
*/ def registerShuffleForCleanup(shuffleDependency: ShuffleDependency[_, _, _]) { registerForCleanup(shuffleDependency, CleanShuffle(shuffleDependency.shuffleId)) @@ -145,6 +150,8 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { doCleanupShuffle(shuffleId, blocking = blockOnShuffleCleanupTasks) case CleanBroadcast(broadcastId) => doCleanupBroadcast(broadcastId, blocking = blockOnCleanupTasks) + case CleanAccum(accId) => + doCleanupAccum(accId, blocking = blockOnCleanupTasks) } } } catch { @@ -190,6 +197,18 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { } } + /** Perform accumulator cleanup. */ + def doCleanupAccum(accId: Long, blocking: Boolean) { + try { + logDebug("Cleaning accumulator " + accId) + Accumulators.remove(accId) + listeners.foreach(_.accumCleaned(accId)) + logInfo("Cleaned accumulator " + accId) + } catch { + case e: Exception => logError("Error cleaning accumulator " + accId, e) + } + } + private def blockManagerMaster = sc.env.blockManager.master private def broadcastManager = sc.env.broadcastManager private def mapOutputTrackerMaster = sc.env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] @@ -206,4 +225,5 @@ private[spark] trait CleanerListener { def rddCleaned(rddId: Int) def shuffleCleaned(shuffleId: Int) def broadcastCleaned(broadcastId: Long) + def accumCleaned(accId: Long) } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 85ec5ea11357e..930d4bea4785b 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -986,7 +986,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * values to using the `+=` method. Only the driver can access the accumulator's `value`. */ def accumulator[T](initialValue: T)(implicit param: AccumulatorParam[T]) = - new Accumulator(initialValue, param) + { + val acc = new Accumulator(initialValue, param) + cleaner.foreach(_.registerAccumulatorForCleanup(acc)) + acc + } /** * Create an [[org.apache.spark.Accumulator]] variable of a given type, with a name for display @@ -994,7 +998,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * driver can access the accumulator's `value`. 
*/ def accumulator[T](initialValue: T, name: String)(implicit param: AccumulatorParam[T]) = { - new Accumulator(initialValue, param, Some(name)) + val acc = new Accumulator(initialValue, param, Some(name)) + cleaner.foreach(_.registerAccumulatorForCleanup(acc)) + acc } /** @@ -1003,8 +1009,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @tparam R accumulator result type * @tparam T type that can be added to the accumulator */ - def accumulable[R, T](initialValue: R)(implicit param: AccumulableParam[R, T]) = - new Accumulable(initialValue, param) + def accumulable[R, T](initialValue: R)(implicit param: AccumulableParam[R, T]) = { + val acc = new Accumulable(initialValue, param) + cleaner.foreach(_.registerAccumulatorForCleanup(acc)) + acc + } /** * Create an [[org.apache.spark.Accumulable]] shared variable, with a name for display in the @@ -1013,8 +1022,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @tparam R accumulator result type * @tparam T type that can be added to the accumulator */ - def accumulable[R, T](initialValue: R, name: String)(implicit param: AccumulableParam[R, T]) = - new Accumulable(initialValue, param, Some(name)) + def accumulable[R, T](initialValue: R, name: String)(implicit param: AccumulableParam[R, T]) = { + val acc = new Accumulable(initialValue, param, Some(name)) + cleaner.foreach(_.registerAccumulatorForCleanup(acc)) + acc + } /** * Create an accumulator from a "mutable collection" type. @@ -1025,7 +1037,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli def accumulableCollection[R <% Growable[T] with TraversableOnce[T] with Serializable: ClassTag, T] (initialValue: R): Accumulable[R, T] = { val param = new GrowableAccumulableParam[R,T] - new Accumulable(initialValue, param) + val acc = new Accumulable(initialValue, param) + cleaner.foreach(_.registerAccumulatorForCleanup(acc)) + acc } /** diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index c58721c2c82b7..bc84e2351ad74 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -890,8 +890,16 @@ class DAGScheduler( if (event.accumUpdates != null) { try { Accumulators.add(event.accumUpdates) + event.accumUpdates.foreach { case (id, partialValue) => - val acc = Accumulators.originals(id).asInstanceOf[Accumulable[Any, Any]] + // In this instance, although the reference in Accumulators.originals is a WeakRef, + // it's guaranteed to exist since the event.accumUpdates Map exists + + val acc = Accumulators.originals(id).get match { + case Some(accum) => accum.asInstanceOf[Accumulable[Any, Any]] + case None => throw new NullPointerException("Non-existent reference to Accumulator") + } + // To avoid UI cruft, ignore cases where value wasn't updated if (acc.name.isDefined && partialValue != acc.zero) { val name = acc.name.get diff --git a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala index f087fc550dde3..bd0f8bdefa171 100644 --- a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark import scala.collection.mutable +import scala.ref.WeakReference import org.scalatest.FunSuite import org.scalatest.Matchers @@ -136,4 +137,23 @@ 
class AccumulatorSuite extends FunSuite with Matchers with LocalSparkContext { } } + test ("garbage collection") { + // Create an accumulator and let it go out of scope to test that it's properly garbage collected + sc = new SparkContext("local", "test") + var acc: Accumulable[mutable.Set[Any], Any] = sc.accumulable(new mutable.HashSet[Any]()) + val accId = acc.id + val ref = WeakReference(acc) + + // Ensure the accumulator is present + assert(ref.get.isDefined) + + // Remove the explicit reference to it and allow weak reference to get garbage collected + acc = null + System.gc() + assert(ref.get.isEmpty) + + Accumulators.remove(accId) + assert(!Accumulators.originals.get(accId).isDefined) + } + } diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala index ae2ae7ed0d3aa..cdfaacee7da40 100644 --- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala @@ -382,6 +382,10 @@ class CleanerTester( toBeCleanedBroadcstIds -= broadcastId logInfo("Broadcast" + broadcastId + " cleaned") } + + def accumCleaned(accId: Long): Unit = { + logInfo("Cleaned accId " + accId + " cleaned") + } } val MAX_VALIDATION_ATTEMPTS = 10 diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 9d0c1273695f6..4bf7f9e647d55 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -735,7 +735,11 @@ class DAGSchedulerSuite extends FunSuiteLike with BeforeAndAfter with LocalSpar completeWithAccumulator(accum.id, taskSets(0), Seq((Success, 42))) completeWithAccumulator(accum.id, taskSets(0), Seq((Success, 42))) assert(results === Map(0 -> 42)) - assert(Accumulators.originals(accum.id).value === 1) + + val accVal = Accumulators.originals(accum.id).get.get.value + + assert(accVal === 1) + assertDataStructuresEmpty } From 934876741683fc254fed18e7ff630614f78944be Mon Sep 17 00:00:00 2001 From: Makoto Fukuhara Date: Mon, 23 Feb 2015 09:24:33 +0000 Subject: [PATCH 274/357] [EXAMPLES] fix typo. Author: Makoto Fukuhara Closes #4724 from fukuo33/fix-typo and squashes the following commits: 8c806b9 [Makoto Fukuhara] fix typo. 
--- .../main/scala/org/apache/spark/examples/BroadcastTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala index 1b53f3edbe92e..4c129dbe2d12d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala @@ -29,7 +29,7 @@ object BroadcastTest { val blockSize = if (args.length > 3) args(3) else "4096" val sparkConf = new SparkConf().setAppName("Broadcast Test") - .set("spark.broadcast.factory", s"org.apache.spark.broadcast.${bcName}BroaddcastFactory") + .set("spark.broadcast.factory", s"org.apache.spark.broadcast.${bcName}BroadcastFactory") .set("spark.broadcast.blockSize", blockSize) val sc = new SparkContext(sparkConf) From 757b14b862a1d39c1bad7b321dae1a3ea8338fbb Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Mon, 23 Feb 2015 11:27:27 +0000 Subject: [PATCH 275/357] [SPARK-5943][Streaming] Update the test to use new API to reduce the warning Author: Saisai Shao Closes #4722 from jerryshao/SPARK-5943 and squashes the following commits: 1b01233 [Saisai Shao] Update the test to use new API to reduce the warning --- python/pyspark/streaming/context.py | 2 +- .../apache/spark/streaming/StreamingContextSuite.scala | 10 +++++----- .../org/apache/spark/streaming/TestSuiteBase.scala | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index b06ab650370bd..2c73083c9f9a8 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -189,7 +189,7 @@ def awaitTermination(self, timeout=None): if timeout is None: self._jssc.awaitTermination() else: - self._jssc.awaitTermination(int(timeout * 1000)) + self._jssc.awaitTerminationOrTimeout(int(timeout * 1000)) def awaitTerminationOrTimeout(self, timeout): """ diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala index 2aa5e0876b6e0..6a7cd97aa3222 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala @@ -190,7 +190,7 @@ class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts w logInfo("Count = " + count + ", Running count = " + runningCount) } ssc.start() - ssc.awaitTermination(500) + ssc.awaitTerminationOrTimeout(500) ssc.stop(stopSparkContext = false, stopGracefully = true) logInfo("Running count = " + runningCount) logInfo("TestReceiver.counter = " + TestReceiver.counter.get()) @@ -223,7 +223,7 @@ class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts w logInfo("Count = " + count + ", Running count = " + runningCount) } ssc.start() - ssc.awaitTermination(500) + ssc.awaitTerminationOrTimeout(500) ssc.stop(stopSparkContext = false, stopGracefully = true) logInfo("Running count = " + runningCount) assert(runningCount > 0) @@ -243,7 +243,7 @@ class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts w // test whether awaitTermination() exits after give amount of time failAfter(1000 millis) { - ssc.awaitTermination(500) + ssc.awaitTerminationOrTimeout(500) } // test whether awaitTermination() does not exit if not time is given @@ -288,7 +288,7 @@ class 
StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts w val exception = intercept[Exception] { ssc.start() - ssc.awaitTermination(5000) + ssc.awaitTerminationOrTimeout(5000) } assert(exception.getMessage.contains("map task"), "Expected exception not thrown") } @@ -299,7 +299,7 @@ class StreamingContextSuite extends FunSuite with BeforeAndAfter with Timeouts w inputStream.transform { rdd => throw new TestException("error in transform"); rdd }.register() val exception = intercept[TestException] { ssc.start() - ssc.awaitTermination(5000) + ssc.awaitTerminationOrTimeout(5000) } assert(exception.getMessage.contains("transform"), "Expected exception not thrown") } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala index c2375ff65edac..3565d621e8a6c 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -348,7 +348,7 @@ trait TestSuiteBase extends FunSuite with BeforeAndAfter with Logging { val startTime = System.currentTimeMillis() while (output.size < numExpectedOutput && System.currentTimeMillis() - startTime < maxWaitTimeMillis) { logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput) - ssc.awaitTermination(50) + ssc.awaitTerminationOrTimeout(50) } val timeTaken = System.currentTimeMillis() - startTime logInfo("Output generated in " + timeTaken + " milliseconds") From 242d49584c6aa21d928db2552033661950f760a5 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 23 Feb 2015 11:29:25 +0000 Subject: [PATCH 276/357] [SPARK-5724] fix the misconfiguration in AkkaUtils https://issues.apache.org/jira/browse/SPARK-5724 In AkkaUtil, we set several failure detector related the parameters as following ``` al akkaConf = ConfigFactory.parseMap(conf.getAkkaConf.toMap[String, String]) .withFallback(akkaSslConfig).withFallback(ConfigFactory.parseString( s""" |akka.daemonic = on |akka.loggers = [""akka.event.slf4j.Slf4jLogger""] |akka.stdout-loglevel = "ERROR" |akka.jvm-exit-on-fatal-error = off |akka.remote.require-cookie = "$requireCookie" |akka.remote.secure-cookie = "$secureCookie" |akka.remote.transport-failure-detector.heartbeat-interval = $akkaHeartBeatInterval s |akka.remote.transport-failure-detector.acceptable-heartbeat-pause = $akkaHeartBeatPauses s |akka.remote.transport-failure-detector.threshold = $akkaFailureDetector |akka.actor.provider = "akka.remote.RemoteActorRefProvider" |akka.remote.netty.tcp.transport-class = "akka.remote.transport.netty.NettyTransport" |akka.remote.netty.tcp.hostname = "$host" |akka.remote.netty.tcp.port = $port |akka.remote.netty.tcp.tcp-nodelay = on |akka.remote.netty.tcp.connection-timeout = $akkaTimeout s |akka.remote.netty.tcp.maximum-frame-size = ${akkaFrameSize}B |akka.remote.netty.tcp.execution-pool-size = $akkaThreads |akka.actor.default-dispatcher.throughput = $akkaBatchSize |akka.log-config-on-start = $logAkkaConfig |akka.remote.log-remote-lifecycle-events = $lifecycleEvents |akka.log-dead-letters = $lifecycleEvents |akka.log-dead-letters-during-shutdown = $lifecycleEvents """.stripMargin)) ``` Actually, we do not have any parameter naming "akka.remote.transport-failure-detector.threshold" see: http://doc.akka.io/docs/akka/2.3.4/general/configuration.html what we have is "akka.remote.watch-failure-detector.threshold" Author: CodingCat Closes #4512 from CodingCat/SPARK-5724 and squashes the 
following commits: bafe56e [CodingCat] fix the grammar in configuration doc 338296e [CodingCat] remove failure-detector related info 8bfcfd4 [CodingCat] fix the misconfiguration in AkkaUtils --- .../org/apache/spark/util/AkkaUtils.scala | 3 -- docs/configuration.md | 36 +++++++------------ 2 files changed, 12 insertions(+), 27 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala index 3d9c6192ff7f7..48a6ede05e17b 100644 --- a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala @@ -79,8 +79,6 @@ private[spark] object AkkaUtils extends Logging { val logAkkaConfig = if (conf.getBoolean("spark.akka.logAkkaConfig", false)) "on" else "off" val akkaHeartBeatPauses = conf.getInt("spark.akka.heartbeat.pauses", 6000) - val akkaFailureDetector = - conf.getDouble("spark.akka.failure-detector.threshold", 300.0) val akkaHeartBeatInterval = conf.getInt("spark.akka.heartbeat.interval", 1000) val secretKey = securityManager.getSecretKey() @@ -106,7 +104,6 @@ private[spark] object AkkaUtils extends Logging { |akka.remote.secure-cookie = "$secureCookie" |akka.remote.transport-failure-detector.heartbeat-interval = $akkaHeartBeatInterval s |akka.remote.transport-failure-detector.acceptable-heartbeat-pause = $akkaHeartBeatPauses s - |akka.remote.transport-failure-detector.threshold = $akkaFailureDetector |akka.actor.provider = "akka.remote.RemoteActorRefProvider" |akka.remote.netty.tcp.transport-class = "akka.remote.transport.netty.NettyTransport" |akka.remote.netty.tcp.hostname = "$host" diff --git a/docs/configuration.md b/docs/configuration.md index 541695c83ae36..c8db338cb6f89 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -903,36 +903,24 @@ Apart from these, the following properties are also available, and may be useful spark.akka.heartbeat.pauses 6000 - This is set to a larger value to disable failure detector that comes inbuilt akka. It can be - enabled again, if you plan to use this feature (Not recommended). Acceptable heart beat pause - in seconds for akka. This can be used to control sensitivity to gc pauses. Tune this in - combination of `spark.akka.heartbeat.interval` and `spark.akka.failure-detector.threshold` - if you need to. - - - - spark.akka.failure-detector.threshold - 300.0 - - This is set to a larger value to disable failure detector that comes inbuilt akka. It can be - enabled again, if you plan to use this feature (Not recommended). This maps to akka's - `akka.remote.transport-failure-detector.threshold`. Tune this in combination of - `spark.akka.heartbeat.pauses` and `spark.akka.heartbeat.interval` if you need to. + This is set to a larger value to disable the transport failure detector that comes built in to Akka. + It can be enabled again, if you plan to use this feature (Not recommended). Acceptable heart + beat pause in seconds for Akka. This can be used to control sensitivity to GC pauses. Tune + this along with `spark.akka.heartbeat.interval` if you need to. spark.akka.heartbeat.interval 1000 - This is set to a larger value to disable failure detector that comes inbuilt akka. It can be - enabled again, if you plan to use this feature (Not recommended). A larger interval value in - seconds reduces network overhead and a smaller value ( ~ 1 s) might be more informative for - akka's failure detector. 
Tune this in combination of `spark.akka.heartbeat.pauses` and - `spark.akka.failure-detector.threshold` if you need to. Only positive use case for using - failure detector can be, a sensistive failure detector can help evict rogue executors really - quick. However this is usually not the case as gc pauses and network lags are expected in a - real Spark cluster. Apart from that enabling this leads to a lot of exchanges of heart beats - between nodes leading to flooding the network with those. + This is set to a larger value to disable the transport failure detector that comes built in to Akka. + It can be enabled again, if you plan to use this feature (Not recommended). A larger interval + value in seconds reduces network overhead and a smaller value ( ~ 1 s) might be more informative + for Akka's failure detector. Tune this in combination of `spark.akka.heartbeat.pauses` if you need + to. A likely positive use case for using failure detector would be: a sensistive failure detector + can help evict rogue executors quickly. However this is usually not the case as GC pauses + and network lags are expected in a real Spark cluster. Apart from that enabling this leads to + a lot of exchanges of heart beats between nodes leading to flooding the network with those. From 651a1c019eb911005e234a46cc559d63da352377 Mon Sep 17 00:00:00 2001 From: Jacky Li Date: Mon, 23 Feb 2015 08:47:28 -0800 Subject: [PATCH 277/357] [SPARK-5939][MLLib] make FPGrowth example app take parameters Add parameter parsing in FPGrowth example app in Scala and Java And a sample data file is added in data/mllib folder Author: Jacky Li Closes #4714 from jackylk/parameter and squashes the following commits: 8c478b3 [Jacky Li] fix according to comments 3bb74f6 [Jacky Li] make FPGrowth exampl app take parameters f0e4d10 [Jacky Li] make FPGrowth exampl app take parameters --- data/mllib/sample_fpgrowth.txt | 6 ++ .../examples/mllib/JavaFPGrowthExample.java | 43 ++++++++++---- .../examples/mllib/FPGrowthExample.scala | 59 ++++++++++++++----- 3 files changed, 81 insertions(+), 27 deletions(-) create mode 100644 data/mllib/sample_fpgrowth.txt diff --git a/data/mllib/sample_fpgrowth.txt b/data/mllib/sample_fpgrowth.txt new file mode 100644 index 0000000000000..c451583e51317 --- /dev/null +++ b/data/mllib/sample_fpgrowth.txt @@ -0,0 +1,6 @@ +r z h k p +z y x w v u t s +s x o n r +x z y m t s q e +z +x z y r q t p diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java index f50e802cf683c..36baf5868736c 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaFPGrowthExample.java @@ -25,32 +25,49 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.fpm.FPGrowth; import org.apache.spark.mllib.fpm.FPGrowthModel; /** * Java example for mining frequent itemsets using FP-growth. 
+ * Example usage: ./bin/run-example mllib.JavaFPGrowthExample ./data/mllib/sample_fpgrowth.txt */ public class JavaFPGrowthExample { public static void main(String[] args) { + String inputFile; + double minSupport = 0.3; + int numPartition = -1; + if (args.length < 1) { + System.err.println( + "Usage: JavaFPGrowth [minSupport] [numPartition]"); + System.exit(1); + } + inputFile = args[0]; + if (args.length >= 2) { + minSupport = Double.parseDouble(args[1]); + } + if (args.length >= 3) { + numPartition = Integer.parseInt(args[2]); + } + SparkConf sparkConf = new SparkConf().setAppName("JavaFPGrowthExample"); JavaSparkContext sc = new JavaSparkContext(sparkConf); + JavaRDD<ArrayList<String>> transactions = sc.textFile(inputFile).map( + new Function<String, ArrayList<String>>() { + @Override + public ArrayList<String> call(String s) { + return Lists.newArrayList(s.split(" ")); + } + } + ); - - // TODO: Read a user-specified input file. - @SuppressWarnings("unchecked") - JavaRDD<ArrayList<String>> transactions = sc.parallelize(Lists.newArrayList( - Lists.newArrayList("r z h k p".split(" ")), - Lists.newArrayList("z y x w v u t s".split(" ")), - Lists.newArrayList("s x o n r".split(" ")), - Lists.newArrayList("x z y m t s q e".split(" ")), - Lists.newArrayList("z".split(" ")), - Lists.newArrayList("x z y r q t p".split(" "))), 2); - - FPGrowth fpg = new FPGrowth() - .setMinSupport(0.3); - FPGrowthModel<String> model = fpg.run(transactions); + FPGrowthModel<String> model = new FPGrowth() + .setMinSupport(minSupport) + .setNumPartitions(numPartition) + .run(transactions); for (FPGrowth.FreqItemset<String> s: model.freqItemsets().toJavaRDD().collect()) { System.out.println("[" + Joiner.on(",").join(s.javaItems()) + "], " + s.freq()); diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala index aaae275ec5524..13f24a1e59610 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala @@ -17,30 +17,61 @@ package org.apache.spark.examples.mllib +import scopt.OptionParser + import org.apache.spark.mllib.fpm.FPGrowth -import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.{SparkConf, SparkContext} /** * Example for mining frequent itemsets using FP-growth.
+ * Example usage: ./bin/run-example mllib.FPGrowthExample \ + * --minSupport 0.8 --numPartition 2 ./data/mllib/sample_fpgrowth.txt */ object FPGrowthExample { + case class Params( + input: String = null, + minSupport: Double = 0.3, + numPartition: Int = -1) extends AbstractParams[Params] + def main(args: Array[String]) { - val conf = new SparkConf().setAppName("FPGrowthExample") + val defaultParams = Params() + + val parser = new OptionParser[Params]("FPGrowthExample") { + head("FPGrowth: an example FP-growth app.") + opt[Double]("minSupport") + .text(s"minimal support level, default: ${defaultParams.minSupport}") + .action((x, c) => c.copy(minSupport = x)) + opt[Int]("numPartition") + .text(s"number of partition, default: ${defaultParams.numPartition}") + .action((x, c) => c.copy(numPartition = x)) + arg[String]("") + .text("input paths to input data set, whose file format is that each line " + + "contains a transaction with each item in String and separated by a space") + .required() + .action((x, c) => c.copy(input = x)) + } + + parser.parse(args, defaultParams).map { params => + run(params) + }.getOrElse { + sys.exit(1) + } + } + + def run(params: Params) { + val conf = new SparkConf().setAppName(s"FPGrowthExample with $params") val sc = new SparkContext(conf) + val transactions = sc.textFile(params.input).map(_.split(" ")).cache() + + println(s"Number of transactions: ${transactions.count()}") + + val model = new FPGrowth() + .setMinSupport(params.minSupport) + .setNumPartitions(params.numPartition) + .run(transactions) - // TODO: Read a user-specified input file. - val transactions = sc.parallelize(Seq( - "r z h k p", - "z y x w v u t s", - "s x o n r", - "x z y m t s q e", - "z", - "x z y r q t p").map(_.split(" ")), numSlices = 2) - - val fpg = new FPGrowth() - .setMinSupport(0.3) - val model = fpg.run(transactions) + println(s"Number of frequent itemsets: ${model.freqItemsets.count()}") model.freqItemsets.collect().foreach { itemset => println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) From 28ccf5ee769a1df019e38985112065c01724fbd9 Mon Sep 17 00:00:00 2001 From: Alexander Ulanov Date: Mon, 23 Feb 2015 12:09:40 -0800 Subject: [PATCH 278/357] [MLLIB] SPARK-5912 Programming guide for feature selection Added description of ChiSqSelector and few words about feature selection in general. I could add a code example, however it would not look reasonable in the absence of feature discretizer or a dataset in the `data` folder that has redundant features. Author: Alexander Ulanov Closes #4709 from avulanov/SPARK-5912 and squashes the following commits: 19a8a4e [Alexander Ulanov] Addressing reviewers comments @jkbradley 58d9e4d [Alexander Ulanov] Addressing reviewers comments @jkbradley eb6b9fe [Alexander Ulanov] Typo 2921a1d [Alexander Ulanov] ChiSqSelector example of use c845350 [Alexander Ulanov] ChiSqSelector docs --- docs/mllib-feature-extraction.md | 54 ++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index d4a61a7fbf3d7..d588b9cb46697 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -375,3 +375,57 @@ data2 = labels.zip(normalizer2.transform(features)) {% endhighlight %}
    + +## Feature selection +[Feature selection](http://en.wikipedia.org/wiki/Feature_selection) allows selecting the most relevant features for use in model construction. The number of features to select can be determined using the validation set. Feature selection is usually applied on sparse data, for example in text classification. Feature selection reduces the size of the vector space and, in turn, the complexity of any subsequent operation with vectors. + +### ChiSqSelector +ChiSqSelector stands for Chi-Squared feature selection. It operates on the labeled data. ChiSqSelector orders categorical features based on their values of Chi-Squared test on independence from class and filters (selects) top given features. + +#### Model Fitting + +[`ChiSqSelector`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) has the +following parameters in the constructor: + +* `numTopFeatures` number of top features that selector will select (filter). + +We provide a [`fit`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) method in +`ChiSqSelector` which can take an input of `RDD[LabeledPoint]` with categorical features, learn the summary statistics, and then +return a model which can transform the input dataset into the reduced feature space. + +This model implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer) +which can apply the Chi-Squared feature selection on a `Vector` to produce a reduced `Vector` or on +an `RDD[Vector]` to produce a reduced `RDD[Vector]`. + +Note that the model that performs actual feature filtering can be instantiated independently with array of feature indices that has to be sorted ascending. + +#### Example + +The following example shows the basic use of ChiSqSelector. + +
    +
    +{% highlight scala %} +import org.apache.spark.SparkContext._ +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.util.MLUtils + +// load some data in libsvm format, each point is in the range 0..255 +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") +// discretize data in 16 equal bins +val discretizedData = data.map { lp => + LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => x / 16 } ) ) +} +// create ChiSqSelector that will select 50 features +val selector = new ChiSqSelector(50) +// create ChiSqSelector model +val transformer = selector.fit(disctetizedData) +// filter top 50 features from each feature vector +val filteredData = disctetizedData.map { lp => + LabeledPoint(lp.label, transformer.transform(lp.features)) +} +{% endhighlight %} +
    +
    + From 59536cc87e10e5011560556729dd901280958f43 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Mon, 23 Feb 2015 16:15:57 -0800 Subject: [PATCH 279/357] [SPARK-5912] [docs] [mllib] Small fixes to ChiSqSelector docs Fixes: * typo in Scala example * Removed comment "usually applied on sparse data" since that is debatable * small edits to text for clarity CC: avulanov I noticed a typo post-hoc and ended up making a few small edits. Do the changes look OK? Author: Joseph K. Bradley Closes #4732 from jkbradley/chisqselector-docs and squashes the following commits: 9656a3b [Joseph K. Bradley] added Java example for ChiSqSelector to guide 3f3f9f4 [Joseph K. Bradley] small fixes to ChiSqSelector docs --- docs/mllib-feature-extraction.md | 72 ++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index d588b9cb46697..80842b27effd8 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -377,27 +377,27 @@ data2 = labels.zip(normalizer2.transform(features))
    ## Feature selection -[Feature selection](http://en.wikipedia.org/wiki/Feature_selection) allows selecting the most relevant features for use in model construction. The number of features to select can be determined using the validation set. Feature selection is usually applied on sparse data, for example in text classification. Feature selection reduces the size of the vector space and, in turn, the complexity of any subsequent operation with vectors. +[Feature selection](http://en.wikipedia.org/wiki/Feature_selection) allows selecting the most relevant features for use in model construction. Feature selection reduces the size of the vector space and, in turn, the complexity of any subsequent operation with vectors. The number of features to select can be tuned using a held-out validation set. ### ChiSqSelector -ChiSqSelector stands for Chi-Squared feature selection. It operates on the labeled data. ChiSqSelector orders categorical features based on their values of Chi-Squared test on independence from class and filters (selects) top given features. +[`ChiSqSelector`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) stands for Chi-Squared feature selection. It operates on labeled data with categorical features. `ChiSqSelector` orders features based on a Chi-Squared test of independence from the class, and then filters (selects) the top features which are most closely related to the label. #### Model Fitting [`ChiSqSelector`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) has the following parameters in the constructor: -* `numTopFeatures` number of top features that selector will select (filter). +* `numTopFeatures` number of top features that the selector will select (filter). We provide a [`fit`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) method in `ChiSqSelector` which can take an input of `RDD[LabeledPoint]` with categorical features, learn the summary statistics, and then -return a model which can transform the input dataset into the reduced feature space. +return a `ChiSqSelectorModel` which can transform an input dataset into the reduced feature space. This model implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer) which can apply the Chi-Squared feature selection on a `Vector` to produce a reduced `Vector` or on an `RDD[Vector]` to produce a reduced `RDD[Vector]`. -Note that the model that performs actual feature filtering can be instantiated independently with array of feature indices that has to be sorted ascending. +Note that the user can also construct a `ChiSqSelectorModel` by hand by providing an array of selected feature indices (which must be sorted in ascending order). 
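A small sketch, not part of the committed guide, of what constructing the model by hand can look like, assuming the `Array[Int]` constructor the sentence above refers to; the index values are arbitrary illustration values, and the only stated requirement is that they are sorted in ascending order:

```scala
import org.apache.spark.mllib.feature.ChiSqSelectorModel
import org.apache.spark.mllib.linalg.Vectors

// Hand-built selector keeping only feature indices 1, 3 and 7 (ascending order required).
val manualModel = new ChiSqSelectorModel(Array(1, 3, 7))
val reduced = manualModel.transform(Vectors.dense(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0))
println(reduced)   // expected to keep the values at positions 1, 3 and 7
```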
#### Example @@ -411,21 +411,69 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils -// load some data in libsvm format, each point is in the range 0..255 +// Load some data in libsvm format val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") -// discretize data in 16 equal bins +// Discretize data in 16 equal bins since ChiSqSelector requires categorical features val discretizedData = data.map { lp => LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => x / 16 } ) ) } -// create ChiSqSelector that will select 50 features +// Create ChiSqSelector that will select 50 features val selector = new ChiSqSelector(50) -// create ChiSqSelector model -val transformer = selector.fit(disctetizedData) -// filter top 50 features from each feature vector -val filteredData = disctetizedData.map { lp => +// Create ChiSqSelector model (selecting features) +val transformer = selector.fit(discretizedData) +// Filter the top 50 features from each feature vector +val filteredData = discretizedData.map { lp => LabeledPoint(lp.label, transformer.transform(lp.features)) } {% endhighlight %}

    + +
    +{% highlight java %} +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.feature.ChiSqSelector; +import org.apache.spark.mllib.feature.ChiSqSelectorModel; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; + +SparkConf sparkConf = new SparkConf().setAppName("JavaChiSqSelector"); +JavaSparkContext sc = new JavaSparkContext(sparkConf); +JavaRDD points = MLUtils.loadLibSVMFile(sc.sc(), + "data/mllib/sample_libsvm_data.txt").toJavaRDD().cache(); + +// Discretize data in 16 equal bins since ChiSqSelector requires categorical features +JavaRDD discretizedData = points.map( + new Function() { + @Override + public LabeledPoint call(LabeledPoint lp) { + final double[] discretizedFeatures = new double[lp.features().size()]; + for (int i = 0; i < lp.features().size(); ++i) { + discretizedFeatures[i] = lp.features().apply(i) / 16; + } + return new LabeledPoint(lp.label(), Vectors.dense(discretizedFeatures)); + } + }); + +// Create ChiSqSelector that will select 50 features +ChiSqSelector selector = new ChiSqSelector(50); +// Create ChiSqSelector model (selecting features) +final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd()); +// Filter the top 50 features from each feature vector +JavaRDD filteredData = discretizedData.map( + new Function() { + @Override + public LabeledPoint call(LabeledPoint lp) { + return new LabeledPoint(lp.label(), transformer.transform(lp.features())); + } + } +); + +sc.stop(); +{% endhighlight %} +
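A side note on the guide text above: it says a `ChiSqSelectorModel` can also be constructed by hand from an array of selected feature indices, but neither example shows that path. A minimal Scala sketch of that usage (the indices and the input vector below are arbitrary illustration values, not taken from the patch):

{% highlight scala %}
import org.apache.spark.mllib.feature.ChiSqSelectorModel
import org.apache.spark.mllib.linalg.Vectors

// Keep only features 0, 3 and 9; the indices must already be sorted in ascending order.
val manualModel = new ChiSqSelectorModel(Array(0, 3, 9))

// Transform a single Vector (an RDD[Vector] works the same way) into the reduced space.
val reduced = manualModel.transform(Vectors.dense(Array.tabulate(10)(_.toDouble)))
// reduced should hold the values at positions 0, 3 and 9, i.e. [0.0, 3.0, 9.0]
{% endhighlight %}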
    From 48376bfe9c97bf31279918def6c6615849c88f4d Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 23 Feb 2015 17:16:34 -0800 Subject: [PATCH 280/357] [SPARK-5935][SQL] Accept MapType in the schema provided to a JSON dataset. JIRA: https://issues.apache.org/jira/browse/SPARK-5935 Author: Yin Huai Author: Yin Huai Closes #4710 from yhuai/jsonMapType and squashes the following commits: 3e40390 [Yin Huai] Remove unnecessary changes. f8e6267 [Yin Huai] Fix test. baa36e3 [Yin Huai] Accept MapType in the schema provided to jsonFile/jsonRDD. --- .../org/apache/spark/sql/json/JsonRDD.scala | 3 + .../org/apache/spark/sql/json/JsonSuite.scala | 56 +++++++++++++++++++ .../apache/spark/sql/json/TestJsonData.scala | 17 ++++++ 3 files changed, 76 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index 3b8dde1823370..d83bdc2f7ff9a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -416,6 +416,9 @@ private[sql] object JsonRDD extends Logging { case NullType => null case ArrayType(elementType, _) => value.asInstanceOf[Seq[Any]].map(enforceCorrectType(_, elementType)) + case MapType(StringType, valueType, _) => + val map = value.asInstanceOf[Map[String, Any]] + map.mapValues(enforceCorrectType(_, valueType)).map(identity) case struct: StructType => asRow(value.asInstanceOf[Map[String, Any]], struct) case DateType => toDate(value) case TimestampType => toTimestamp(value) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index c94e44bd7c397..005f20b96df79 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -657,6 +657,62 @@ class JsonSuite extends QueryTest { ) } + test("Applying schemas with MapType") { + val schemaWithSimpleMap = StructType( + StructField("map", MapType(StringType, IntegerType, true), false) :: Nil) + val jsonWithSimpleMap = jsonRDD(mapType1, schemaWithSimpleMap) + + jsonWithSimpleMap.registerTempTable("jsonWithSimpleMap") + + checkAnswer( + sql("select map from jsonWithSimpleMap"), + Row(Map("a" -> 1)) :: + Row(Map("b" -> 2)) :: + Row(Map("c" -> 3)) :: + Row(Map("c" -> 1, "d" -> 4)) :: + Row(Map("e" -> null)) :: Nil + ) + + checkAnswer( + sql("select map['c'] from jsonWithSimpleMap"), + Row(null) :: + Row(null) :: + Row(3) :: + Row(1) :: + Row(null) :: Nil + ) + + val innerStruct = StructType( + StructField("field1", ArrayType(IntegerType, true), true) :: + StructField("field2", IntegerType, true) :: Nil) + val schemaWithComplexMap = StructType( + StructField("map", MapType(StringType, innerStruct, true), false) :: Nil) + + val jsonWithComplexMap = jsonRDD(mapType2, schemaWithComplexMap) + + jsonWithComplexMap.registerTempTable("jsonWithComplexMap") + + checkAnswer( + sql("select map from jsonWithComplexMap"), + Row(Map("a" -> Row(Seq(1, 2, 3, null), null))) :: + Row(Map("b" -> Row(null, 2))) :: + Row(Map("c" -> Row(Seq(), 4))) :: + Row(Map("c" -> Row(null, 3), "d" -> Row(Seq(null), null))) :: + Row(Map("e" -> null)) :: + Row(Map("f" -> Row(null, null))) :: Nil + ) + + checkAnswer( + sql("select map['a'].field1, map['c'].field2 from jsonWithComplexMap"), + Row(Seq(1, 2, 3, null), null) :: + Row(null, null) :: + Row(null, 4) :: + Row(null, 3) :: + Row(null, null) :: + Row(null, null) :: Nil 
+ ) + } + test("SPARK-2096 Correctly parse dot notations") { val jsonDF = jsonRDD(complexFieldAndType2) jsonDF.registerTempTable("jsonTable") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala index 3370b3c98b4be..15698f61e0837 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala @@ -146,6 +146,23 @@ object TestJsonData { ]] }""" :: Nil) + val mapType1 = + TestSQLContext.sparkContext.parallelize( + """{"map": {"a": 1}}""" :: + """{"map": {"b": 2}}""" :: + """{"map": {"c": 3}}""" :: + """{"map": {"c": 1, "d": 4}}""" :: + """{"map": {"e": null}}""" :: Nil) + + val mapType2 = + TestSQLContext.sparkContext.parallelize( + """{"map": {"a": {"field1": [1, 2, 3, null]}}}""" :: + """{"map": {"b": {"field2": 2}}}""" :: + """{"map": {"c": {"field1": [], "field2": 4}}}""" :: + """{"map": {"c": {"field2": 3}, "d": {"field1": [null]}}}""" :: + """{"map": {"e": null}}""" :: + """{"map": {"f": {"field1": null}}}""" :: Nil) + val nullsInArrays = TestSQLContext.sparkContext.parallelize( """{"field1":[[null], [[["Test"]]]]}""" :: From 1ed57086d402c38d95cda6c3d9d7aea806609bf9 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Mon, 23 Feb 2015 17:34:54 -0800 Subject: [PATCH 281/357] [SPARK-5873][SQL] Allow viewing of partially analyzed plans in queryExecution Author: Michael Armbrust Closes #4684 from marmbrus/explainAnalysis and squashes the following commits: afbaa19 [Michael Armbrust] fix python d93278c [Michael Armbrust] fix hive e5fa0a4 [Michael Armbrust] Merge remote-tracking branch 'origin/master' into explainAnalysis 52119f2 [Michael Armbrust] more tests 82a5431 [Michael Armbrust] fix tests 25753d2 [Michael Armbrust] Merge remote-tracking branch 'origin/master' into explainAnalysis aee1e6a [Michael Armbrust] fix hive b23a844 [Michael Armbrust] newline de8dc51 [Michael Armbrust] more comments acf620a [Michael Armbrust] [SPARK-5873][SQL] Show partially analyzed plans in query execution --- python/pyspark/sql/context.py | 30 ++--- .../apache/spark/sql/catalyst/SqlParser.scala | 2 + .../sql/catalyst/analysis/Analyzer.scala | 83 -------------- .../sql/catalyst/analysis/CheckAnalysis.scala | 105 ++++++++++++++++++ .../sql/catalyst/analysis/AnalysisSuite.scala | 35 +++--- .../org/apache/spark/sql/DataFrame.scala | 2 +- .../scala/org/apache/spark/sql/SQLConf.scala | 5 +- .../org/apache/spark/sql/SQLContext.scala | 14 ++- .../org/apache/spark/sql/sources/rules.scala | 10 +- .../spark/sql/sources/DataSourceTest.scala | 1 - .../spark/sql/sources/InsertSuite.scala | 2 +- .../apache/spark/sql/hive/HiveContext.scala | 1 - 12 files changed, 164 insertions(+), 126 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 313f15e6d9b6f..125933c9d3ae0 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -267,20 +267,20 @@ def applySchema(self, rdd, schema): ... StructField("byte2", ByteType(), False), ... StructField("short1", ShortType(), False), ... StructField("short2", ShortType(), False), - ... StructField("int", IntegerType(), False), - ... StructField("float", FloatType(), False), - ... StructField("date", DateType(), False), - ... StructField("time", TimestampType(), False), - ... StructField("map", + ... 
StructField("int1", IntegerType(), False), + ... StructField("float1", FloatType(), False), + ... StructField("date1", DateType(), False), + ... StructField("time1", TimestampType(), False), + ... StructField("map1", ... MapType(StringType(), IntegerType(), False), False), - ... StructField("struct", + ... StructField("struct1", ... StructType([StructField("b", ShortType(), False)]), False), - ... StructField("list", ArrayType(ByteType(), False), False), - ... StructField("null", DoubleType(), True)]) + ... StructField("list1", ArrayType(ByteType(), False), False), + ... StructField("null1", DoubleType(), True)]) >>> df = sqlCtx.applySchema(rdd, schema) >>> results = df.map( - ... lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int, x.float, x.date, - ... x.time, x.map["a"], x.struct.b, x.list, x.null)) + ... lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int1, x.float1, x.date1, + ... x.time1, x.map1["a"], x.struct1.b, x.list1, x.null1)) >>> results.collect()[0] # doctest: +NORMALIZE_WHITESPACE (127, -128, -32768, 32767, 2147483647, 1.0, datetime.date(2010, 1, 1), datetime.datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None) @@ -288,20 +288,20 @@ def applySchema(self, rdd, schema): >>> df.registerTempTable("table2") >>> sqlCtx.sql( ... "SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " + - ... "short1 + 1 AS short1, short2 - 1 AS short2, int - 1 AS int, " + - ... "float + 1.5 as float FROM table2").collect() - [Row(byte1=126, byte2=-127, short1=-32767, short2=32766, int=2147483646, float=2.5)] + ... "short1 + 1 AS short1, short2 - 1 AS short2, int1 - 1 AS int1, " + + ... "float1 + 1.5 as float1 FROM table2").collect() + [Row(byte1=126, byte2=-127, short1=-32767, short2=32766, int1=2147483646, float1=2.5)] >>> from pyspark.sql.types import _parse_schema_abstract, _infer_schema_type >>> rdd = sc.parallelize([(127, -32768, 1.0, ... datetime(2010, 1, 1, 1, 1, 1), ... 
{"a": 1}, (2,), [1, 2, 3])]) - >>> abstract = "byte short float time map{} struct(b) list[]" + >>> abstract = "byte1 short1 float1 time1 map1{} struct1(b) list1[]" >>> schema = _parse_schema_abstract(abstract) >>> typedSchema = _infer_schema_type(rdd.first(), schema) >>> df = sqlCtx.applySchema(rdd, typedSchema) >>> df.collect() - [Row(byte=127, short=-32768, float=1.0, time=..., list=[1, 2, 3])] + [Row(byte1=127, short1=-32768, float1=1.0, time1=..., list1=[1, 2, 3])] """ if isinstance(rdd, DataFrame): diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index 124f083669358..b16aff99af1c5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -78,6 +78,7 @@ class SqlParser extends AbstractSparkSQLParser { protected val IF = Keyword("IF") protected val IN = Keyword("IN") protected val INNER = Keyword("INNER") + protected val INT = Keyword("INT") protected val INSERT = Keyword("INSERT") protected val INTERSECT = Keyword("INTERSECT") protected val INTO = Keyword("INTO") @@ -394,6 +395,7 @@ class SqlParser extends AbstractSparkSQLParser { | fixedDecimalType | DECIMAL ^^^ DecimalType.Unlimited | DATE ^^^ DateType + | INT ^^^ IntegerType ) protected lazy val fixedDecimalType: Parser[DataType] = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index fc37b8cde0806..e4e542562f22d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -52,12 +52,6 @@ class Analyzer(catalog: Catalog, */ val extendedResolutionRules: Seq[Rule[LogicalPlan]] = Nil - /** - * Override to provide additional rules for the "Check Analysis" batch. - * These rules will be evaluated after our built-in check rules. - */ - val extendedCheckRules: Seq[Rule[LogicalPlan]] = Nil - lazy val batches: Seq[Batch] = Seq( Batch("Resolution", fixedPoint, ResolveRelations :: @@ -71,87 +65,10 @@ class Analyzer(catalog: Catalog, TrimGroupingAliases :: typeCoercionRules ++ extendedResolutionRules : _*), - Batch("Check Analysis", Once, - CheckResolution +: - extendedCheckRules: _*), Batch("Remove SubQueries", fixedPoint, EliminateSubQueries) ) - /** - * Makes sure all attributes and logical plans have been resolved. - */ - object CheckResolution extends Rule[LogicalPlan] { - def failAnalysis(msg: String) = { throw new AnalysisException(msg) } - - def apply(plan: LogicalPlan): LogicalPlan = { - // We transform up and order the rules so as to catch the first possible failure instead - // of the result of cascading resolution failures. 
- plan.foreachUp { - case operator: LogicalPlan => - operator transformExpressionsUp { - case a: Attribute if !a.resolved => - val from = operator.inputSet.map(_.name).mkString(", ") - a.failAnalysis(s"cannot resolve '${a.prettyString}' given input columns $from") - - case c: Cast if !c.resolved => - failAnalysis( - s"invalid cast from ${c.child.dataType.simpleString} to ${c.dataType.simpleString}") - - case b: BinaryExpression if !b.resolved => - failAnalysis( - s"invalid expression ${b.prettyString} " + - s"between ${b.left.simpleString} and ${b.right.simpleString}") - } - - operator match { - case f: Filter if f.condition.dataType != BooleanType => - failAnalysis( - s"filter expression '${f.condition.prettyString}' " + - s"of type ${f.condition.dataType.simpleString} is not a boolean.") - - case aggregatePlan @ Aggregate(groupingExprs, aggregateExprs, child) => - def checkValidAggregateExpression(expr: Expression): Unit = expr match { - case _: AggregateExpression => // OK - case e: Attribute if !groupingExprs.contains(e) => - failAnalysis( - s"expression '${e.prettyString}' is neither present in the group by, " + - s"nor is it an aggregate function. " + - "Add to group by or wrap in first() if you don't care which value you get.") - case e if groupingExprs.contains(e) => // OK - case e if e.references.isEmpty => // OK - case e => e.children.foreach(checkValidAggregateExpression) - } - - val cleaned = aggregateExprs.map(_.transform { - // Should trim aliases around `GetField`s. These aliases are introduced while - // resolving struct field accesses, because `GetField` is not a `NamedExpression`. - // (Should we just turn `GetField` into a `NamedExpression`?) - case Alias(g, _) => g - }) - - cleaned.foreach(checkValidAggregateExpression) - - case o if o.children.nonEmpty && - !o.references.filter(_.name != "grouping__id").subsetOf(o.inputSet) => - val missingAttributes = (o.references -- o.inputSet).map(_.prettyString).mkString(",") - val input = o.inputSet.map(_.prettyString).mkString(",") - - failAnalysis(s"resolved attributes $missingAttributes missing from $input") - - // Catch all - case o if !o.resolved => - failAnalysis( - s"unresolved operator ${operator.simpleString}") - - case _ => // Analysis successful! - } - } - - plan - } - } - /** * Removes no-op Alias expressions from the plan. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala new file mode 100644 index 0000000000000..4e8fc892f3eea --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.types._ + +/** + * Throws user facing errors when passed invalid queries that fail to analyze. + */ +class CheckAnalysis { + + /** + * Override to provide additional checks for correct analysis. + * These rules will be evaluated after our built-in check rules. + */ + val extendedCheckRules: Seq[LogicalPlan => Unit] = Nil + + def failAnalysis(msg: String) = { + throw new AnalysisException(msg) + } + + def apply(plan: LogicalPlan): Unit = { + // We transform up and order the rules so as to catch the first possible failure instead + // of the result of cascading resolution failures. + plan.foreachUp { + case operator: LogicalPlan => + operator transformExpressionsUp { + case a: Attribute if !a.resolved => + val from = operator.inputSet.map(_.name).mkString(", ") + a.failAnalysis(s"cannot resolve '${a.prettyString}' given input columns $from") + + case c: Cast if !c.resolved => + failAnalysis( + s"invalid cast from ${c.child.dataType.simpleString} to ${c.dataType.simpleString}") + + case b: BinaryExpression if !b.resolved => + failAnalysis( + s"invalid expression ${b.prettyString} " + + s"between ${b.left.simpleString} and ${b.right.simpleString}") + } + + operator match { + case f: Filter if f.condition.dataType != BooleanType => + failAnalysis( + s"filter expression '${f.condition.prettyString}' " + + s"of type ${f.condition.dataType.simpleString} is not a boolean.") + + case aggregatePlan@Aggregate(groupingExprs, aggregateExprs, child) => + def checkValidAggregateExpression(expr: Expression): Unit = expr match { + case _: AggregateExpression => // OK + case e: Attribute if !groupingExprs.contains(e) => + failAnalysis( + s"expression '${e.prettyString}' is neither present in the group by, " + + s"nor is it an aggregate function. " + + "Add to group by or wrap in first() if you don't care which value you get.") + case e if groupingExprs.contains(e) => // OK + case e if e.references.isEmpty => // OK + case e => e.children.foreach(checkValidAggregateExpression) + } + + val cleaned = aggregateExprs.map(_.transform { + // Should trim aliases around `GetField`s. These aliases are introduced while + // resolving struct field accesses, because `GetField` is not a `NamedExpression`. + // (Should we just turn `GetField` into a `NamedExpression`?) + case Alias(g, _) => g + }) + + cleaned.foreach(checkValidAggregateExpression) + + case o if o.children.nonEmpty && + !o.references.filter(_.name != "grouping__id").subsetOf(o.inputSet) => + val missingAttributes = (o.references -- o.inputSet).map(_.prettyString).mkString(",") + val input = o.inputSet.map(_.prettyString).mkString(",") + + failAnalysis(s"resolved attributes $missingAttributes missing from $input") + + // Catch all + case o if !o.resolved => + failAnalysis( + s"unresolved operator ${operator.simpleString}") + + case _ => // Analysis successful! 
+ } + } + extendedCheckRules.foreach(_(plan)) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index aec7847356cd4..c1dd5aa913ddc 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -30,11 +30,21 @@ import org.apache.spark.sql.catalyst.dsl.plans._ class AnalysisSuite extends FunSuite with BeforeAndAfter { val caseSensitiveCatalog = new SimpleCatalog(true) val caseInsensitiveCatalog = new SimpleCatalog(false) - val caseSensitiveAnalyze = + + val caseSensitiveAnalyzer = new Analyzer(caseSensitiveCatalog, EmptyFunctionRegistry, caseSensitive = true) - val caseInsensitiveAnalyze = + val caseInsensitiveAnalyzer = new Analyzer(caseInsensitiveCatalog, EmptyFunctionRegistry, caseSensitive = false) + val checkAnalysis = new CheckAnalysis + + + def caseSensitiveAnalyze(plan: LogicalPlan) = + checkAnalysis(caseSensitiveAnalyzer(plan)) + + def caseInsensitiveAnalyze(plan: LogicalPlan) = + checkAnalysis(caseInsensitiveAnalyzer(plan)) + val testRelation = LocalRelation(AttributeReference("a", IntegerType, nullable = true)()) val testRelation2 = LocalRelation( AttributeReference("a", StringType)(), @@ -55,7 +65,7 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { a.select(UnresolvedStar(None)).select('a).unionAll(b.select(UnresolvedStar(None))) } - assert(caseInsensitiveAnalyze(plan).resolved) + assert(caseInsensitiveAnalyzer(plan).resolved) } test("check project's resolved") { @@ -71,11 +81,11 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { test("analyze project") { assert( - caseSensitiveAnalyze(Project(Seq(UnresolvedAttribute("a")), testRelation)) === + caseSensitiveAnalyzer(Project(Seq(UnresolvedAttribute("a")), testRelation)) === Project(testRelation.output, testRelation)) assert( - caseSensitiveAnalyze( + caseSensitiveAnalyzer( Project(Seq(UnresolvedAttribute("TbL.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) === Project(testRelation.output, testRelation)) @@ -88,13 +98,13 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { assert(e.getMessage().toLowerCase.contains("cannot resolve")) assert( - caseInsensitiveAnalyze( + caseInsensitiveAnalyzer( Project(Seq(UnresolvedAttribute("TbL.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) === Project(testRelation.output, testRelation)) assert( - caseInsensitiveAnalyze( + caseInsensitiveAnalyzer( Project(Seq(UnresolvedAttribute("tBl.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL")))) === Project(testRelation.output, testRelation)) @@ -107,16 +117,13 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { assert(e.getMessage == "Table Not Found: tAbLe") assert( - caseSensitiveAnalyze(UnresolvedRelation(Seq("TaBlE"), None)) === - testRelation) + caseSensitiveAnalyzer(UnresolvedRelation(Seq("TaBlE"), None)) === testRelation) assert( - caseInsensitiveAnalyze(UnresolvedRelation(Seq("tAbLe"), None)) === - testRelation) + caseInsensitiveAnalyzer(UnresolvedRelation(Seq("tAbLe"), None)) === testRelation) assert( - caseInsensitiveAnalyze(UnresolvedRelation(Seq("TaBlE"), None)) === - testRelation) + caseInsensitiveAnalyzer(UnresolvedRelation(Seq("TaBlE"), None)) === testRelation) } def errorTest( @@ -177,7 +184,7 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter { AttributeReference("d", DecimalType.Unlimited)(), 
AttributeReference("e", ShortType)()) - val plan = caseInsensitiveAnalyze( + val plan = caseInsensitiveAnalyzer( testRelation2.select( 'a / Literal(2) as 'div1, 'a / 'b as 'div2, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 69e5f6a07da7f..27ac398063d43 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -117,7 +117,7 @@ class DataFrame protected[sql]( this(sqlContext, { val qe = sqlContext.executePlan(logicalPlan) if (sqlContext.conf.dataFrameEagerAnalysis) { - qe.analyzed // This should force analysis and throw errors if there are any + qe.assertAnalyzed() // This should force analysis and throw errors if there are any } qe }) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 39f6c2f4bc8b4..a08c0f5ce3ff4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -52,8 +52,9 @@ private[spark] object SQLConf { // This is used to set the default data source val DEFAULT_DATA_SOURCE_NAME = "spark.sql.sources.default" - // Whether to perform eager analysis on a DataFrame. - val DATAFRAME_EAGER_ANALYSIS = "spark.sql.dataframe.eagerAnalysis" + // Whether to perform eager analysis when constructing a dataframe. + // Set to false when debugging requires the ability to look at invalid query plans. + val DATAFRAME_EAGER_ANALYSIS = "spark.sql.eagerAnalysis" object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 4bdaa023914b8..ce800e0754559 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -114,7 +114,6 @@ class SQLContext(@transient val sparkContext: SparkContext) new Analyzer(catalog, functionRegistry, caseSensitive = true) { override val extendedResolutionRules = ExtractPythonUdfs :: - sources.PreWriteCheck(catalog) :: sources.PreInsertCastAndRename :: Nil } @@ -1057,6 +1056,13 @@ class SQLContext(@transient val sparkContext: SparkContext) Batch("Add exchange", Once, AddExchange(self)) :: Nil } + @transient + protected[sql] lazy val checkAnalysis = new CheckAnalysis { + override val extendedCheckRules = Seq( + sources.PreWriteCheck(catalog) + ) + } + /** * :: DeveloperApi :: * The primary workflow for executing relational queries using Spark. Designed to allow easy @@ -1064,9 +1070,13 @@ class SQLContext(@transient val sparkContext: SparkContext) */ @DeveloperApi protected[sql] class QueryExecution(val logical: LogicalPlan) { + def assertAnalyzed(): Unit = checkAnalysis(analyzed) lazy val analyzed: LogicalPlan = analyzer(logical) - lazy val withCachedData: LogicalPlan = cacheManager.useCachedData(analyzed) + lazy val withCachedData: LogicalPlan = { + assertAnalyzed + cacheManager.useCachedData(analyzed) + } lazy val optimizedPlan: LogicalPlan = optimizer(withCachedData) // TODO: Don't just pick the first one... 
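The `SQLConf` and `SQLContext` changes above are the heart of this patch: analysis errors are no longer raised while the `DataFrame` is built but when `assertAnalyzed()` runs, and the renamed `spark.sql.eagerAnalysis` flag controls whether that happens eagerly. A rough, hypothetical sketch of the debugging workflow this enables (the table and column names are invented, and it assumes `queryExecution` is reachable from the calling code):

{% highlight scala %}
// Debugging sketch only. With eager analysis switched off, a DataFrame over an invalid
// query can be constructed and inspected; the failure surfaces when the plan is executed.
sqlContext.setConf("spark.sql.eagerAnalysis", "false")

val df = sqlContext.sql("SELECT noSuchColumn FROM people")  // no longer throws here
println(df.queryExecution)  // prints the parsed and (partially) analyzed logical plans

// df.collect()             // would still fail: checkAnalysis runs before execution
{% endhighlight %}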
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala index 36a9c0bdc41e6..8440581074877 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala @@ -78,10 +78,10 @@ private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] { /** * A rule to do various checks before inserting into or writing to a data source table. */ -private[sql] case class PreWriteCheck(catalog: Catalog) extends Rule[LogicalPlan] { +private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => Unit) { def failAnalysis(msg: String) = { throw new AnalysisException(msg) } - def apply(plan: LogicalPlan): LogicalPlan = { + def apply(plan: LogicalPlan): Unit = { plan.foreach { case i @ logical.InsertIntoTable( l @ LogicalRelation(t: InsertableRelation), partition, query, overwrite) => @@ -93,7 +93,7 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends Rule[LogicalPlan val srcRelations = query.collect { case LogicalRelation(src: BaseRelation) => src } - if (srcRelations.exists(src => src == t)) { + if (srcRelations.contains(t)) { failAnalysis( "Cannot insert overwrite into table that is also being read from.") } else { @@ -119,7 +119,7 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends Rule[LogicalPlan val srcRelations = query.collect { case LogicalRelation(src: BaseRelation) => src } - if (srcRelations.exists(src => src == dest)) { + if (srcRelations.contains(dest)) { failAnalysis( s"Cannot overwrite table $tableName that is also being read from.") } else { @@ -134,7 +134,5 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends Rule[LogicalPlan case _ => // OK } - - plan } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala index 0ec6881d7afe6..91c6367371f15 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala @@ -30,7 +30,6 @@ abstract class DataSourceTest extends QueryTest with BeforeAndAfter { override protected[sql] lazy val analyzer: Analyzer = new Analyzer(catalog, functionRegistry, caseSensitive = false) { override val extendedResolutionRules = - PreWriteCheck(catalog) :: PreInsertCastAndRename :: Nil } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 5682e5a2bcea9..b5b16f9546691 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -205,7 +205,7 @@ class InsertSuite extends DataSourceTest with BeforeAndAfterAll { val message = intercept[AnalysisException] { sql( s""" - |INSERT OVERWRITE TABLE oneToTen SELECT a FROM jt + |INSERT OVERWRITE TABLE oneToTen SELECT CAST(a AS INT) FROM jt """.stripMargin) }.getMessage assert( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 2e205e67c0fdd..c439dfe0a71f8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -268,7 +268,6 @@ class HiveContext(sc: SparkContext) extends 
SQLContext(sc) { catalog.PreInsertionCasts :: ExtractPythonUdfs :: ResolveUdtfsAlias :: - sources.PreWriteCheck(catalog) :: sources.PreInsertCastAndRename :: Nil } From cf2e41653de778dc8db8b03385a053aae1152e19 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 23 Feb 2015 22:08:44 -0800 Subject: [PATCH 282/357] [SPARK-5958][MLLIB][DOC] update block matrix user guide * Removed SVD code from examples. * Corrected Java API doc link. * Updated variable names: `AtransposeA` -> `ata`. * Minor changes. brkyvz Author: Xiangrui Meng Closes #4737 from mengxr/update-block-matrix-user-guide and squashes the following commits: 70f53ac [Xiangrui Meng] update block matrix user guide --- docs/mllib-data-types.md | 41 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index 24d22b9bcdfa4..fe6c1bf7bfd99 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -298,23 +298,22 @@ In general the use of non-deterministic RDDs can lead to errors. ### BlockMatrix -A `BlockMatrix` is a distributed matrix backed by an RDD of `MatrixBlock`s, where `MatrixBlock` is +A `BlockMatrix` is a distributed matrix backed by an RDD of `MatrixBlock`s, where a `MatrixBlock` is a tuple of `((Int, Int), Matrix)`, where the `(Int, Int)` is the index of the block, and `Matrix` is the sub-matrix at the given index with size `rowsPerBlock` x `colsPerBlock`. -`BlockMatrix` supports methods such as `.add` and `.multiply` with another `BlockMatrix`. -`BlockMatrix` also has a helper function `.validate` which can be used to debug whether the +`BlockMatrix` supports methods such as `add` and `multiply` with another `BlockMatrix`. +`BlockMatrix` also has a helper function `validate` which can be used to check whether the `BlockMatrix` is set up properly.
    A [`BlockMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix) can be -most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` using `.toBlockMatrix()`. -`.toBlockMatrix()` will create blocks of size 1024 x 1024. Users may change the sizes of their blocks -by supplying the values through `.toBlockMatrix(rowsPerBlock, colsPerBlock)`. +most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` by calling `toBlockMatrix`. +`toBlockMatrix` creates blocks of size 1024 x 1024 by default. +Users may change the block size by supplying the values through `toBlockMatrix(rowsPerBlock, colsPerBlock)`. {% highlight scala %} -import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, MatrixEntry} val entries: RDD[MatrixEntry] = ... // an RDD of (i, j, v) matrix entries @@ -323,29 +322,24 @@ val coordMat: CoordinateMatrix = new CoordinateMatrix(entries) // Transform the CoordinateMatrix to a BlockMatrix val matA: BlockMatrix = coordMat.toBlockMatrix().cache() -// validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid. +// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid. // Nothing happens if it is valid. -matA.validate +matA.validate() // Calculate A^T A. -val AtransposeA = matA.transpose.multiply(matA) - -// get SVD of 2 * A -val A2 = matA.add(matA) -val svd = A2.toIndexedRowMatrix().computeSVD(20, false, 1e-9) +val ata = matA.transpose.multiply(matA) {% endhighlight %}
    -A [`BlockMatrix`](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix) can be -most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` using `.toBlockMatrix()`. -`.toBlockMatrix()` will create blocks of size 1024 x 1024. Users may change the sizes of their blocks -by supplying the values through `.toBlockMatrix(rowsPerBlock, colsPerBlock)`. +A [`BlockMatrix`](api/java/org/apache/spark/mllib/linalg/distributed/BlockMatrix.html) can be +most easily created from an `IndexedRowMatrix` or `CoordinateMatrix` by calling `toBlockMatrix`. +`toBlockMatrix` creates blocks of size 1024 x 1024 by default. +Users may change the block size by supplying the values through `toBlockMatrix(rowsPerBlock, colsPerBlock)`. {% highlight java %} import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.mllib.linalg.SingularValueDecomposition; import org.apache.spark.mllib.linalg.distributed.BlockMatrix; import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix; import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix; @@ -356,17 +350,12 @@ CoordinateMatrix coordMat = new CoordinateMatrix(entries.rdd()); // Transform the CoordinateMatrix to a BlockMatrix BlockMatrix matA = coordMat.toBlockMatrix().cache(); -// validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid. +// Validate whether the BlockMatrix is set up properly. Throws an Exception when it is not valid. // Nothing happens if it is valid. matA.validate(); // Calculate A^T A. -BlockMatrix AtransposeA = matA.transpose().multiply(matA); - -// get SVD of 2 * A -BlockMatrix A2 = matA.add(matA); -SingularValueDecomposition svd = - A2.toIndexedRowMatrix().computeSVD(20, false, 1e-9); +BlockMatrix ata = matA.transpose().multiply(matA); {% endhighlight %}
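Both snippets above rely on the default 1024 x 1024 blocks; since the text only mentions `toBlockMatrix(rowsPerBlock, colsPerBlock)` in passing, here is a short Scala sketch with explicit block sizes, reusing the `coordMat` from the Scala example (the sizes are arbitrary illustration values):

{% highlight scala %}
// Sketch only: 500 x 500 blocks instead of the 1024 x 1024 default.
val matB: BlockMatrix = coordMat.toBlockMatrix(500, 500).cache()
matB.validate()

// Operations such as add require both operands to use the same block sizes.
val doubled: BlockMatrix = matB.add(matB)
{% endhighlight %}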
    From 840333133396d443e747f62fce9967f7681fb276 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 24 Feb 2015 10:45:38 -0800 Subject: [PATCH 283/357] [SPARK-5968] [SQL] Suppresses ParquetOutputCommitter WARN logs Please refer to the [JIRA ticket] [1] for the motivation. [1]: https://issues.apache.org/jira/browse/SPARK-5968 [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4744) Author: Cheng Lian Closes #4744 from liancheng/spark-5968 and squashes the following commits: caac6a8 [Cheng Lian] Suppresses ParquetOutputCommitter WARN logs --- .../apache/spark/sql/parquet/ParquetRelation.scala | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index b0db9943a506c..a0d1005c0cae3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -18,11 +18,12 @@ package org.apache.spark.sql.parquet import java.io.IOException +import java.util.logging.Level import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.fs.permission.FsAction -import parquet.hadoop.ParquetOutputFormat +import parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat} import parquet.hadoop.metadata.CompressionCodecName import parquet.schema.MessageType @@ -91,7 +92,7 @@ private[sql] object ParquetRelation { // checks first to see if there's any handlers already set // and if not it creates them. If this method executes prior // to that class being loaded then: - // 1) there's no handlers installed so there's none to + // 1) there's no handlers installed so there's none to // remove. But when it IS finally loaded the desired affect // of removing them is circumvented. // 2) The parquet.Log static initializer calls setUseParentHanders(false) @@ -99,7 +100,7 @@ private[sql] object ParquetRelation { // // Therefore we need to force the class to be loaded. // This should really be resolved by Parquet. - Class.forName(classOf[parquet.Log].getName()) + Class.forName(classOf[parquet.Log].getName) // Note: Logger.getLogger("parquet") has a default logger // that appends to Console which needs to be cleared. @@ -108,6 +109,11 @@ private[sql] object ParquetRelation { // TODO(witgo): Need to set the log level ? // if(parquetLogger.getLevel != null) parquetLogger.setLevel(null) if (!parquetLogger.getUseParentHandlers) parquetLogger.setUseParentHandlers(true) + + // Disables WARN log message in ParquetOutputCommitter. + // See https://issues.apache.org/jira/browse/SPARK-5968 for details + Class.forName(classOf[ParquetOutputCommitter].getName) + java.util.logging.Logger.getLogger(classOf[ParquetOutputCommitter].getName).setLevel(Level.OFF) } // The element type for the RDDs that this relation maps to. 
From 0a59e45e2f2e6f00ccd5f10c79f629fb796fd8d0 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 24 Feb 2015 10:49:51 -0800 Subject: [PATCH 284/357] [SPARK-5910][SQL] Support for as in selectExpr Author: Michael Armbrust Closes #4736 from marmbrus/asExprs and squashes the following commits: 5ba97e4 [Michael Armbrust] [SPARK-5910][SQL] Support for as in selectExpr --- .../scala/org/apache/spark/sql/catalyst/SqlParser.scala | 2 +- .../test/scala/org/apache/spark/sql/DataFrameSuite.scala | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index b16aff99af1c5..c363a5efacde8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -40,7 +40,7 @@ class SqlParser extends AbstractSparkSQLParser { def parseExpression(input: String): Expression = { // Initialize the Keywords. lexical.initialize(reservedWords) - phrase(expression)(new lexical.Scanner(input)) match { + phrase(projection)(new lexical.Scanner(input)) match { case Success(plan, _) => plan case failureOrError => sys.error(failureOrError.toString) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 6b9b3a8425964..e71e9bee3a6d8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -130,6 +130,12 @@ class DataFrameSuite extends QueryTest { testData.collect().map(row => Row(math.abs(row.getInt(0)), row.getString(1))).toSeq) } + test("selectExpr with alias") { + checkAnswer( + testData.selectExpr("key as k").select("k"), + testData.select("key").collect().toSeq) + } + test("filterExpr") { checkAnswer( testData.filter("key > 90"), From 201236628a344194f7c20ba8e9afeeaefbe9318c Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 24 Feb 2015 10:52:18 -0800 Subject: [PATCH 285/357] [SPARK-5532][SQL] Repartition should not use external rdd representation Author: Michael Armbrust Closes #4738 from marmbrus/udtRepart and squashes the following commits: c06d7b5 [Michael Armbrust] fix compilation 91c8829 [Michael Armbrust] [SQL][SPARK-5532] Repartition should not use external rdd representation --- .../scala/org/apache/spark/sql/DataFrame.scala | 5 +++-- .../spark/sql/execution/debug/package.scala | 1 + .../apache/spark/sql/UserDefinedTypeSuite.scala | 16 +++++++++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 27ac398063d43..04bf5d9b0f931 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -799,14 +799,15 @@ class DataFrame protected[sql]( * Returns the number of rows in the [[DataFrame]]. * @group action */ - override def count(): Long = groupBy().count().rdd.collect().head.getLong(0) + override def count(): Long = groupBy().count().collect().head.getLong(0) /** * Returns a new [[DataFrame]] that has exactly `numPartitions` partitions. 
* @group rdd */ override def repartition(numPartitions: Int): DataFrame = { - sqlContext.createDataFrame(rdd.repartition(numPartitions), schema) + sqlContext.createDataFrame( + queryExecution.toRdd.map(_.copy()).repartition(numPartitions), schema) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 73162b22fa9cd..ffe388cfa9532 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -168,6 +168,7 @@ package object debug { case (_: Short, ShortType) => case (_: Boolean, BooleanType) => case (_: Double, DoubleType) => + case (v, udt: UserDefinedType[_]) => typeCheck(v, udt.sqlType) case (d, t) => sys.error(s"Invalid data found: got $d (${d.getClass}) expected $t") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 5f21d990e2e5b..9c098df24c65f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -17,10 +17,11 @@ package org.apache.spark.sql +import java.io.File + import scala.beans.{BeanInfo, BeanProperty} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext.{sparkContext, sql} import org.apache.spark.sql.test.TestSQLContext.implicits._ @@ -91,4 +92,17 @@ class UserDefinedTypeSuite extends QueryTest { sql("SELECT testType(features) from points"), Seq(Row(true), Row(true))) } + + + test("UDTs with Parquet") { + val tempDir = File.createTempFile("parquet", "test") + tempDir.delete() + pointsRDD.saveAsParquetFile(tempDir.getCanonicalPath) + } + + test("Repartition UDTs with Parquet") { + val tempDir = File.createTempFile("parquet", "test") + tempDir.delete() + pointsRDD.repartition(1).saveAsParquetFile(tempDir.getCanonicalPath) + } } From 64d2c01ff1048de83b9b8efce987b55e457298f9 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 24 Feb 2015 11:02:47 -0800 Subject: [PATCH 286/357] [Spark-5967] [UI] Correctly clean JobProgressListener.stageIdToActiveJobIds Patch should be self-explanatory pwendell JoshRosen Author: Tathagata Das Closes #4741 from tdas/SPARK-5967 and squashes the following commits: 653b5bb [Tathagata Das] Fixed the fix and added test e2de972 [Tathagata Das] Clear stages which have no corresponding active jobs. 
--- .../spark/ui/jobs/JobProgressListener.scala | 3 +++ .../ui/jobs/JobProgressListenerSuite.scala | 22 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala index 0b6fe70bd2062..937d95a934b59 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala @@ -203,6 +203,9 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging { for (stageId <- jobData.stageIds) { stageIdToActiveJobIds.get(stageId).foreach { jobsUsingStage => jobsUsingStage.remove(jobEnd.jobId) + if (jobsUsingStage.isEmpty) { + stageIdToActiveJobIds.remove(stageId) + } stageIdToInfo.get(stageId).foreach { stageInfo => if (stageInfo.submissionTime.isEmpty) { // if this stage is pending, it won't complete, so mark it as "skipped": diff --git a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala index 6019282d2fb70..730a4b54f5aa1 100644 --- a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala @@ -88,6 +88,28 @@ class JobProgressListenerSuite extends FunSuite with LocalSparkContext with Matc listener.completedStages.map(_.stageId).toSet should be (Set(50, 49, 48, 47, 46)) } + test("test clearing of stageIdToActiveJobs") { + val conf = new SparkConf() + conf.set("spark.ui.retainedStages", 5.toString) + val listener = new JobProgressListener(conf) + val jobId = 0 + val stageIds = 1 to 50 + // Start a job with 50 stages + listener.onJobStart(createJobStartEvent(jobId, stageIds)) + for (stageId <- stageIds) { + listener.onStageSubmitted(createStageStartEvent(stageId)) + } + listener.stageIdToActiveJobIds.size should be > 0 + + // Complete the stages and job + for (stageId <- stageIds) { + listener.onStageCompleted(createStageEndEvent(stageId, failed = false)) + } + listener.onJobEnd(createJobEndEvent(jobId, false)) + assertActiveJobsStateIsEmpty(listener) + listener.stageIdToActiveJobIds.size should be (0) + } + test("test LRU eviction of jobs") { val conf = new SparkConf() conf.set("spark.ui.retainedStages", 5.toString) From 6d2caa576fcdc5c848d1472b09c685b3871e220e Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 24 Feb 2015 11:08:07 -0800 Subject: [PATCH 287/357] [SPARK-5965] Standalone Worker UI displays {{USER_JAR}} For screenshot see: https://issues.apache.org/jira/browse/SPARK-5965 This was caused by 20a6013106b56a1a1cc3e8cda092330ffbe77cc3. 
Author: Andrew Or Closes #4739 from andrewor14/user-jar-blocker and squashes the following commits: 23c4a9e [Andrew Or] Use right argument --- .../scala/org/apache/spark/deploy/worker/ui/WorkerPage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerPage.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerPage.scala index 327b905032800..720f13bfa829b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerPage.scala @@ -134,7 +134,7 @@ private[spark] class WorkerPage(parent: WorkerWebUI) extends WebUIPage("") { def driverRow(driver: DriverRunner): Seq[Node] = { {driver.driverId} - {driver.driverDesc.command.arguments(1)} + {driver.driverDesc.command.arguments(2)} {driver.finalState.getOrElse(DriverState.RUNNING)} {driver.driverDesc.cores.toString} From 105791e35cee694f3b2ac1e06758650fe44e2c71 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 24 Feb 2015 11:38:59 -0800 Subject: [PATCH 288/357] [MLLIB] Change x_i to y_i in Variance's user guide Variance is calculated on labels/responses. Author: Xiangrui Meng Closes #4740 from mengxr/patch-1 and squashes the following commits: 673317b [Xiangrui Meng] [MLLIB] Change x_i to y_i in Variance's user guide --- docs/mllib-decision-tree.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index d1537def851e7..6675133a810db 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -54,8 +54,8 @@ impurity measure for regression (variance). Variance Regression - $\frac{1}{N} \sum_{i=1}^{N} (x_i - \mu)^2$$y_i$ is label for an instance, - $N$ is the number of instances and $\mu$ is the mean given by $\frac{1}{N} \sum_{i=1}^N x_i$. + $\frac{1}{N} \sum_{i=1}^{N} (y_i - \mu)^2$$y_i$ is label for an instance, + $N$ is the number of instances and $\mu$ is the mean given by $\frac{1}{N} \sum_{i=1}^N y_i$. From c5ba975ee85521f708ebeec81144347cf1b40fba Mon Sep 17 00:00:00 2001 From: Judy Date: Tue, 24 Feb 2015 20:50:16 +0000 Subject: [PATCH 289/357] [Spark-5708] Add Slf4jSink to Spark Metrics Add Slf4jSink to Spark Metrics using Coda Hale's SlfjReporter. This sends metrics to log4j, allowing spark users to reuse log4j pipeline for metrics collection. Reviewed existing unit tests and didn't see any sink-related tests. Please advise on if tests should be added. 
Author: Judy Author: judynash Closes #4644 from judynash/master and squashes the following commits: 57ef214 [judynash] doc clarification and indent fixes a751a66 [Judy] Spark-5708: Add Slf4jSink to Spark Metrics --- conf/metrics.properties.template | 9 +++ .../apache/spark/metrics/sink/Slf4jSink.scala | 68 +++++++++++++++++++ docs/monitoring.md | 1 + 3 files changed, 78 insertions(+) create mode 100644 core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala diff --git a/conf/metrics.properties.template b/conf/metrics.properties.template index 464c14457e53f..2e0cb5db170ac 100644 --- a/conf/metrics.properties.template +++ b/conf/metrics.properties.template @@ -122,6 +122,15 @@ #worker.sink.csv.unit=minutes +# Enable Slf4jSink for all instances by class name +#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink + +# Polling period for Slf4JSink +#*.sink.sl4j.period=1 + +#*.sink.sl4j.unit=minutes + + # Enable jvm source for instance master, worker, driver and executor #master.source.jvm.class=org.apache.spark.metrics.source.JvmSource diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala b/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala new file mode 100644 index 0000000000000..e8b3074e8f1a6 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.metrics.sink + +import java.util.Properties +import java.util.concurrent.TimeUnit + +import com.codahale.metrics.{Slf4jReporter, MetricRegistry} + +import org.apache.spark.SecurityManager +import org.apache.spark.metrics.MetricsSystem + +private[spark] class Slf4jSink( + val property: Properties, + val registry: MetricRegistry, + securityMgr: SecurityManager) + extends Sink { + val SLF4J_DEFAULT_PERIOD = 10 + val SLF4J_DEFAULT_UNIT = "SECONDS" + + val SLF4J_KEY_PERIOD = "period" + val SLF4J_KEY_UNIT = "unit" + + val pollPeriod = Option(property.getProperty(SLF4J_KEY_PERIOD)) match { + case Some(s) => s.toInt + case None => SLF4J_DEFAULT_PERIOD + } + + val pollUnit: TimeUnit = Option(property.getProperty(SLF4J_KEY_UNIT)) match { + case Some(s) => TimeUnit.valueOf(s.toUpperCase()) + case None => TimeUnit.valueOf(SLF4J_DEFAULT_UNIT) + } + + MetricsSystem.checkMinimalPollingPeriod(pollUnit, pollPeriod) + + val reporter: Slf4jReporter = Slf4jReporter.forRegistry(registry) + .convertDurationsTo(TimeUnit.MILLISECONDS) + .convertRatesTo(TimeUnit.SECONDS) + .build() + + override def start() { + reporter.start(pollPeriod, pollUnit) + } + + override def stop() { + reporter.stop() + } + + override def report() { + reporter.report() + } +} + diff --git a/docs/monitoring.md b/docs/monitoring.md index 7a5cadc171d6d..009a344dff4bb 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -176,6 +176,7 @@ Each instance can report to zero or more _sinks_. Sinks are contained in the * `JmxSink`: Registers metrics for viewing in a JMX console. * `MetricsServlet`: Adds a servlet within the existing Spark UI to serve metrics data as JSON data. * `GraphiteSink`: Sends metrics to a Graphite node. +* `Slf4jSink`: Sends metrics to slf4j as log entries. Spark also supports a Ganglia sink which is not included in the default build due to licensing restrictions: From a2b9137923e0ba328da8fff2fbbfcf2abf50b033 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 24 Feb 2015 13:39:29 -0800 Subject: [PATCH 290/357] [SPARK-5952][SQL] Lock when using hive metastore client Author: Michael Armbrust Closes #4746 from marmbrus/hiveLock and squashes the following commits: 8b871cf [Michael Armbrust] [SPARK-5952][SQL] Lock when using hive metastore client --- .../spark/sql/hive/HiveMetastoreCatalog.scala | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index f7ad2efc9544e..2cc8d65d3cb79 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -52,6 +52,7 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with /** Connection to hive metastore. Usages should lock on `this`. */ protected[hive] val client = Hive.get(hive.hiveconf) + /** Usages should lock on `this`. */ protected[hive] lazy val hiveWarehouse = new Warehouse(hive.hiveconf) // TODO: Use this everywhere instead of tuples or databaseName, tableName,. 
@@ -65,7 +66,9 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val cacheLoader = new CacheLoader[QualifiedTableName, LogicalPlan]() { override def load(in: QualifiedTableName): LogicalPlan = { logDebug(s"Creating new cached data source for $in") - val table = client.getTable(in.database, in.name) + val table = synchronized { + client.getTable(in.database, in.name) + } val schemaString = table.getProperty("spark.sql.sources.schema") val userSpecifiedSchema = if (schemaString == null) { @@ -134,15 +137,18 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with } } - def hiveDefaultTableFilePath(tableName: String): String = { + def hiveDefaultTableFilePath(tableName: String): String = synchronized { val currentDatabase = client.getDatabase(hive.sessionState.getCurrentDatabase) + hiveWarehouse.getTablePath(currentDatabase, tableName).toString } - def tableExists(tableIdentifier: Seq[String]): Boolean = { + def tableExists(tableIdentifier: Seq[String]): Boolean = synchronized { val tableIdent = processTableIdentifier(tableIdentifier) - val databaseName = tableIdent.lift(tableIdent.size - 2).getOrElse( - hive.sessionState.getCurrentDatabase) + val databaseName = + tableIdent + .lift(tableIdent.size - 2) + .getOrElse(hive.sessionState.getCurrentDatabase) val tblName = tableIdent.last client.getTable(databaseName, tblName, false) != null } @@ -219,7 +225,7 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with } } - override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { + override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = synchronized { val dbName = if (!caseSensitive) { if (databaseName.isDefined) Some(databaseName.get.toLowerCase) else None } else { From da505e59274d1c838653c1109db65ad374e65304 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 24 Feb 2015 14:50:00 -0800 Subject: [PATCH 291/357] [SPARK-5973] [PySpark] fix zip with two RDDs with AutoBatchedSerializer Author: Davies Liu Closes #4745 from davies/fix_zip and squashes the following commits: 2124b2c [Davies Liu] Update tests.py b5c828f [Davies Liu] increase the number of records c1e40fd [Davies Liu] fix zip with two RDDs with AutoBatchedSerializer --- python/pyspark/rdd.py | 2 +- python/pyspark/tests.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index ba2347ae76844..d3148de6f41a3 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1950,7 +1950,7 @@ def batch_as(rdd, batchSize): my_batch = get_batch_size(self._jrdd_deserializer) other_batch = get_batch_size(other._jrdd_deserializer) - if my_batch != other_batch: + if my_batch != other_batch or not my_batch: # use the smallest batchSize for both of them batchSize = min(my_batch, other_batch) if batchSize <= 0: diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 52e82091c9f81..06ba2b461d53e 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -543,6 +543,12 @@ def test_zip_with_different_serializers(self): # regression test for bug in _reserializer() self.assertEqual(cnt, t.zip(rdd).count()) + def test_zip_with_different_object_sizes(self): + # regress test for SPARK-5973 + a = self.sc.parallelize(range(10000)).map(lambda i: '*' * i) + b = self.sc.parallelize(range(10000, 20000)).map(lambda i: '*' * i) + self.assertEqual(10000, a.zip(b).count()) + def test_zip_with_different_number_of_items(self): a = 
self.sc.parallelize(range(5), 2) # different number of partitions From 2a0fe34891882e0fde1b5722d8227aa99acc0f1f Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 24 Feb 2015 15:13:22 -0800 Subject: [PATCH 292/357] [SPARK-5436] [MLlib] Validate GradientBoostedTrees using runWithValidation One can early stop if the decrease in error rate is lesser than a certain tol or if the error increases if the training data is overfit. This introduces a new method runWithValidation which takes in a pair of RDD's , one for the training data and the other for the validation. Author: MechCoder Closes #4677 from MechCoder/spark-5436 and squashes the following commits: 1bb21d4 [MechCoder] Combine regression and classification tests into a single one e4d799b [MechCoder] Addresses indentation and doc comments b48a70f [MechCoder] COSMIT b928a19 [MechCoder] Move validation while training section under usage tips fad9b6e [MechCoder] Made the following changes 1. Add section to documentation 2. Return corresponding to bestValidationError 3. Allow negative tolerance. 55e5c3b [MechCoder] One liner for prevValidateError 3e74372 [MechCoder] TST: Add test for classification 77549a9 [MechCoder] [SPARK-5436] Validate GradientBoostedTrees using runWithValidation --- docs/mllib-ensembles.md | 11 +++ .../mllib/tree/GradientBoostedTrees.scala | 75 +++++++++++++++++-- .../tree/configuration/BoostingStrategy.scala | 6 +- .../tree/GradientBoostedTreesSuite.scala | 36 +++++++++ 4 files changed, 122 insertions(+), 6 deletions(-) diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md index fb90b7039971c..00040e6073d0d 100644 --- a/docs/mllib-ensembles.md +++ b/docs/mllib-ensembles.md @@ -427,6 +427,17 @@ We omit some decision tree parameters since those are covered in the [decision t * **`algo`**: The algorithm or task (classification vs. regression) is set using the tree [Strategy] parameter. +#### Validation while training + +Gradient boosting can overfit when trained with more trees. In order to prevent overfitting, it is useful to validate while +training. The method runWithValidation has been provided to make use of this option. It takes a pair of RDD's as arguments, the +first one being the training dataset and the second being the validation dataset. + +The training is stopped when the improvement in the validation error is not more than a certain tolerance +(supplied by the `validationTol` argument in `BoostingStrategy`). In practice, the validation error +decreases initially and later increases. There might be cases in which the validation error does not change monotonically, +and the user is advised to set a large enough negative tolerance and examine the validation curve to to tune the number of +iterations. 
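For concreteness, here is a minimal Scala sketch of the API this section describes (an editorial addition, not text from the patch). It assumes an existing `data: RDD[LabeledPoint]`, splits it with `randomSplit`, and builds the strategies the same way the new test suite further down does; all parameter values are illustrative.

```scala
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
import org.apache.spark.mllib.tree.configuration.Algo.Regression
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.mllib.tree.loss.SquaredError

// Hold out part of the data for validation.
val Array(train, validation) = data.randomSplit(Array(0.8, 0.2))

val treeStrategy = new Strategy(algo = Regression, impurity = Variance, maxDepth = 2,
  categoricalFeaturesInfo = Map.empty)
// numIterations is deliberately large; validation-based early stopping trims it.
val boostingStrategy =
  new BoostingStrategy(treeStrategy, SquaredError, numIterations = 100, validationTol = 1e-5)

val model = new GradientBoostedTrees(boostingStrategy).runWithValidation(train, validation)
```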
### Examples diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index 61f6b1313f82e..b4466ff40937f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -60,11 +60,12 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) def run(input: RDD[LabeledPoint]): GradientBoostedTreesModel = { val algo = boostingStrategy.treeStrategy.algo algo match { - case Regression => GradientBoostedTrees.boost(input, boostingStrategy) + case Regression => GradientBoostedTrees.boost(input, input, boostingStrategy, validate=false) case Classification => // Map labels to -1, +1 so binary classification can be treated as regression. val remappedInput = input.map(x => new LabeledPoint((x.label * 2) - 1, x.features)) - GradientBoostedTrees.boost(remappedInput, boostingStrategy) + GradientBoostedTrees.boost(remappedInput, + remappedInput, boostingStrategy, validate=false) case _ => throw new IllegalArgumentException(s"$algo is not supported by the gradient boosting.") } @@ -76,8 +77,46 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) def run(input: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = { run(input.rdd) } -} + /** + * Method to validate a gradient boosting model + * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. + * @param validationInput Validation dataset: + RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. + Should be different from and follow the same distribution as input. + e.g., these two datasets could be created from an original dataset + by using [[org.apache.spark.rdd.RDD.randomSplit()]] + * @return a gradient boosted trees model that can be used for prediction + */ + def runWithValidation( + input: RDD[LabeledPoint], + validationInput: RDD[LabeledPoint]): GradientBoostedTreesModel = { + val algo = boostingStrategy.treeStrategy.algo + algo match { + case Regression => GradientBoostedTrees.boost( + input, validationInput, boostingStrategy, validate=true) + case Classification => + // Map labels to -1, +1 so binary classification can be treated as regression. + val remappedInput = input.map( + x => new LabeledPoint((x.label * 2) - 1, x.features)) + val remappedValidationInput = validationInput.map( + x => new LabeledPoint((x.label * 2) - 1, x.features)) + GradientBoostedTrees.boost(remappedInput, remappedValidationInput, boostingStrategy, + validate=true) + case _ => + throw new IllegalArgumentException(s"$algo is not supported by the gradient boosting.") + } + } + + /** + * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation]]. + */ + def runWithValidation( + input: JavaRDD[LabeledPoint], + validationInput: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = { + runWithValidation(input.rdd, validationInput.rdd) + } +} object GradientBoostedTrees extends Logging { @@ -108,12 +147,16 @@ object GradientBoostedTrees extends Logging { /** * Internal method for performing regression using trees as base learners. * @param input training dataset + * @param validationInput validation dataset, ignored if validate is set to false. * @param boostingStrategy boosting parameters + * @param validate whether or not to use the validation dataset. 
* @return a gradient boosted trees model that can be used for prediction */ private def boost( input: RDD[LabeledPoint], - boostingStrategy: BoostingStrategy): GradientBoostedTreesModel = { + validationInput: RDD[LabeledPoint], + boostingStrategy: BoostingStrategy, + validate: Boolean): GradientBoostedTreesModel = { val timer = new TimeTracker() timer.start("total") @@ -129,6 +172,7 @@ object GradientBoostedTrees extends Logging { val learningRate = boostingStrategy.learningRate // Prepare strategy for individual trees, which use regression with variance impurity. val treeStrategy = boostingStrategy.treeStrategy.copy + val validationTol = boostingStrategy.validationTol treeStrategy.algo = Regression treeStrategy.impurity = Variance treeStrategy.assertValid() @@ -152,13 +196,16 @@ object GradientBoostedTrees extends Logging { baseLearnerWeights(0) = 1.0 val startingModel = new GradientBoostedTreesModel(Regression, Array(firstTreeModel), Array(1.0)) logDebug("error of gbt = " + loss.computeError(startingModel, input)) + // Note: A model of type regression is used since we require raw prediction timer.stop("building tree 0") + var bestValidateError = if (validate) loss.computeError(startingModel, validationInput) else 0.0 + var bestM = 1 + // psuedo-residual for second iteration data = input.map(point => LabeledPoint(loss.gradient(startingModel, point), point.features)) - var m = 1 while (m < numIterations) { timer.start(s"building tree $m") @@ -177,6 +224,23 @@ object GradientBoostedTrees extends Logging { val partialModel = new GradientBoostedTreesModel( Regression, baseLearners.slice(0, m + 1), baseLearnerWeights.slice(0, m + 1)) logDebug("error of gbt = " + loss.computeError(partialModel, input)) + + if (validate) { + // Stop training early if + // 1. Reduction in error is less than the validationTol or + // 2. If the error increases, that is if the model is overfit. + // We want the model returned corresponding to the best validation error. + val currentValidateError = loss.computeError(partialModel, validationInput) + if (bestValidateError - currentValidateError < validationTol) { + return new GradientBoostedTreesModel( + boostingStrategy.treeStrategy.algo, + baseLearners.slice(0, bestM), + baseLearnerWeights.slice(0, bestM)) + } else if (currentValidateError < bestValidateError) { + bestValidateError = currentValidateError + bestM = m + 1 + } + } // Update data with pseudo-residuals data = input.map(point => LabeledPoint(-loss.gradient(partialModel, point), point.features)) @@ -191,4 +255,5 @@ object GradientBoostedTrees extends Logging { new GradientBoostedTreesModel( boostingStrategy.treeStrategy.algo, baseLearners, baseLearnerWeights) } + } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index ed8e6a796f8c4..664c8df019233 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala @@ -34,6 +34,9 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss} * weak hypotheses used in the final model. * @param learningRate Learning rate for shrinking the contribution of each estimator. The * learning rate should be between in the interval (0, 1] + * @param validationTol Useful when runWithValidation is used. 
If the error rate on the + * validation input between two iterations is less than the validationTol + * then stop. Ignored when [[run]] is used. */ @Experimental case class BoostingStrategy( @@ -42,7 +45,8 @@ case class BoostingStrategy( @BeanProperty var loss: Loss, // Optional boosting parameters @BeanProperty var numIterations: Int = 100, - @BeanProperty var learningRate: Double = 0.1) extends Serializable { + @BeanProperty var learningRate: Double = 0.1, + @BeanProperty var validationTol: Double = 1e-5) extends Serializable { /** * Check validity of parameters. diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala index bde47606eb001..b437aeaaf0547 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala @@ -158,6 +158,40 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext { } } } + + test("runWithValidation stops early and performs better on a validation dataset") { + // Set numIterations large enough so that it stops early. + val numIterations = 20 + val trainRdd = sc.parallelize(GradientBoostedTreesSuite.trainData, 2) + val validateRdd = sc.parallelize(GradientBoostedTreesSuite.validateData, 2) + + val algos = Array(Regression, Regression, Classification) + val losses = Array(SquaredError, AbsoluteError, LogLoss) + (algos zip losses) map { + case (algo, loss) => { + val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2, + categoricalFeaturesInfo = Map.empty) + val boostingStrategy = + new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0) + val gbtValidate = new GradientBoostedTrees(boostingStrategy) + .runWithValidation(trainRdd, validateRdd) + assert(gbtValidate.numTrees !== numIterations) + + // Test that it performs better on the validation dataset. + val gbt = GradientBoostedTrees.train(trainRdd, boostingStrategy) + val (errorWithoutValidation, errorWithValidation) = { + if (algo == Classification) { + val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features)) + (loss.computeError(gbt, remappedRdd), loss.computeError(gbtValidate, remappedRdd)) + } else { + (loss.computeError(gbt, validateRdd), loss.computeError(gbtValidate, validateRdd)) + } + } + assert(errorWithValidation <= errorWithoutValidation) + } + } + } + } private object GradientBoostedTreesSuite { @@ -166,4 +200,6 @@ private object GradientBoostedTreesSuite { val testCombinations = Array((10, 1.0, 1.0), (10, 0.1, 1.0), (10, 0.5, 0.75), (10, 0.1, 0.75)) val data = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 10, 100) + val trainData = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 20, 120) + val validateData = EnsembleTestHelper.generateOrderedLabeledPoints(numFeatures = 20, 80) } From f816e73902b8ca28e24bf1f79a70533f75f239db Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 25 Feb 2015 08:34:55 +0800 Subject: [PATCH 293/357] [SPARK-5751] [SQL] [WIP] Revamped HiveThriftServer2Suite for robustness **NOTICE** Do NOT merge this, as we're waiting for #3881 to be merged. `HiveThriftServer2Suite` has been notorious for its flakiness for a while. This was mostly due to spawning and communicate with external server processes. This PR revamps this test suite for better robustness: 1. 
Fixes a racing condition occurred while using `tail -f` to check log file It's possible that the line we are looking for has already been printed into the log file before we start the `tail -f` process. This PR uses `tail -n +0 -f` to ensure all lines are checked. 2. Retries up to 3 times if the server fails to start In most of the cases, the server fails to start because of port conflict. This PR no longer asks the system to choose an available TCP port, but uses a random port first, and retries up to 3 times if the server fails to start. 3. A server instance is reused among all test cases within a single suite The original `HiveThriftServer2Suite` is splitted into two test suites, `HiveThriftBinaryServerSuite` and `HiveThriftHttpServerSuite`. Each suite starts a `HiveThriftServer2` instance and reuses it for all of its test cases. **TODO** - [ ] Starts the Thrift server in foreground once #3881 is merged (adding `--foreground` flag to `spark-daemon.sh`) [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4720) Author: Cheng Lian Closes #4720 from liancheng/revamp-thrift-server-tests and squashes the following commits: d6c80eb [Cheng Lian] Relaxes server startup timeout 6f14eb1 [Cheng Lian] Revamped HiveThriftServer2Suite for robustness --- .../thriftserver/HiveThriftServer2Suite.scala | 387 ----------------- .../HiveThriftServer2Suites.scala | 403 ++++++++++++++++++ 2 files changed, 403 insertions(+), 387 deletions(-) delete mode 100644 sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala create mode 100644 sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala deleted file mode 100644 index b52a51d11e4ad..0000000000000 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suite.scala +++ /dev/null @@ -1,387 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.hive.thriftserver - -import java.io.File -import java.net.ServerSocket -import java.sql.{Date, DriverManager, Statement} -import java.util.concurrent.TimeoutException - -import scala.collection.JavaConversions._ -import scala.collection.mutable.ArrayBuffer -import scala.concurrent.duration._ -import scala.concurrent.{Await, Promise} -import scala.sys.process.{Process, ProcessLogger} -import scala.util.Try - -import org.apache.hadoop.hive.conf.HiveConf.ConfVars -import org.apache.hive.jdbc.HiveDriver -import org.apache.hive.service.auth.PlainSaslHelper -import org.apache.hive.service.cli.GetInfoType -import org.apache.hive.service.cli.thrift.TCLIService.Client -import org.apache.hive.service.cli.thrift._ -import org.apache.thrift.protocol.TBinaryProtocol -import org.apache.thrift.transport.TSocket -import org.scalatest.FunSuite - -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.util.getTempFilePath -import org.apache.spark.sql.hive.HiveShim - -/** - * Tests for the HiveThriftServer2 using JDBC. - * - * NOTE: SPARK_PREPEND_CLASSES is explicitly disabled in this test suite. Assembly jar must be - * rebuilt after changing HiveThriftServer2 related code. - */ -class HiveThriftServer2Suite extends FunSuite with Logging { - Class.forName(classOf[HiveDriver].getCanonicalName) - - object TestData { - def getTestDataFilePath(name: String) = { - Thread.currentThread().getContextClassLoader.getResource(s"data/files/$name") - } - - val smallKv = getTestDataFilePath("small_kv.txt") - val smallKvWithNull = getTestDataFilePath("small_kv_with_null.txt") - } - - def randomListeningPort = { - // Let the system to choose a random available port to avoid collision with other parallel - // builds. - val socket = new ServerSocket(0) - val port = socket.getLocalPort - socket.close() - port - } - - def withJdbcStatement( - serverStartTimeout: FiniteDuration = 1.minute, - httpMode: Boolean = false)( - f: Statement => Unit) { - val port = randomListeningPort - - startThriftServer(port, serverStartTimeout, httpMode) { - val jdbcUri = if (httpMode) { - s"jdbc:hive2://${"localhost"}:$port/" + - "default?hive.server2.transport.mode=http;hive.server2.thrift.http.path=cliservice" - } else { - s"jdbc:hive2://${"localhost"}:$port/" - } - - val user = System.getProperty("user.name") - val connection = DriverManager.getConnection(jdbcUri, user, "") - val statement = connection.createStatement() - - try { - f(statement) - } finally { - statement.close() - connection.close() - } - } - } - - def withCLIServiceClient( - serverStartTimeout: FiniteDuration = 1.minute)( - f: ThriftCLIServiceClient => Unit) { - val port = randomListeningPort - - startThriftServer(port) { - // Transport creation logics below mimics HiveConnection.createBinaryTransport - val rawTransport = new TSocket("localhost", port) - val user = System.getProperty("user.name") - val transport = PlainSaslHelper.getPlainTransport(user, "anonymous", rawTransport) - val protocol = new TBinaryProtocol(transport) - val client = new ThriftCLIServiceClient(new Client(protocol)) - - transport.open() - - try { - f(client) - } finally { - transport.close() - } - } - } - - def startThriftServer( - port: Int, - serverStartTimeout: FiniteDuration = 1.minute, - httpMode: Boolean = false)( - f: => Unit) { - val startScript = "../../sbin/start-thriftserver.sh".split("/").mkString(File.separator) - val stopScript = "../../sbin/stop-thriftserver.sh".split("/").mkString(File.separator) - - val warehousePath = 
getTempFilePath("warehouse") - val metastorePath = getTempFilePath("metastore") - val metastoreJdbcUri = s"jdbc:derby:;databaseName=$metastorePath;create=true" - - val command = - if (httpMode) { - s"""$startScript - | --master local - | --hiveconf hive.root.logger=INFO,console - | --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$metastoreJdbcUri - | --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath - | --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST}=localhost - | --hiveconf ${ConfVars.HIVE_SERVER2_TRANSPORT_MODE}=http - | --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_HTTP_PORT}=$port - | --driver-class-path ${sys.props("java.class.path")} - | --conf spark.ui.enabled=false - """.stripMargin.split("\\s+").toSeq - } else { - s"""$startScript - | --master local - | --hiveconf hive.root.logger=INFO,console - | --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$metastoreJdbcUri - | --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath - | --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST}=localhost - | --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_PORT}=$port - | --driver-class-path ${sys.props("java.class.path")} - | --conf spark.ui.enabled=false - """.stripMargin.split("\\s+").toSeq - } - - val serverRunning = Promise[Unit]() - val buffer = new ArrayBuffer[String]() - val LOGGING_MARK = - s"starting ${HiveThriftServer2.getClass.getCanonicalName.stripSuffix("$")}, logging to " - var logTailingProcess: Process = null - var logFilePath: String = null - - def captureLogOutput(line: String): Unit = { - buffer += line - if (line.contains("ThriftBinaryCLIService listening on") || - line.contains("Started ThriftHttpCLIService in http")) { - serverRunning.success(()) - } - } - - def captureThriftServerOutput(source: String)(line: String): Unit = { - if (line.startsWith(LOGGING_MARK)) { - logFilePath = line.drop(LOGGING_MARK.length).trim - // Ensure that the log file is created so that the `tail' command won't fail - Try(new File(logFilePath).createNewFile()) - logTailingProcess = Process(s"/usr/bin/env tail -f $logFilePath") - .run(ProcessLogger(captureLogOutput, _ => ())) - } - } - - val env = Seq( - // Resets SPARK_TESTING to avoid loading Log4J configurations in testing class paths - "SPARK_TESTING" -> "0") - - Process(command, None, env: _*).run(ProcessLogger( - captureThriftServerOutput("stdout"), - captureThriftServerOutput("stderr"))) - - try { - Await.result(serverRunning.future, serverStartTimeout) - f - } catch { - case cause: Exception => - cause match { - case _: TimeoutException => - logError(s"Failed to start Hive Thrift server within $serverStartTimeout", cause) - case _ => - } - logError( - s""" - |===================================== - |HiveThriftServer2Suite failure output - |===================================== - |HiveThriftServer2 command line: ${command.mkString(" ")} - |Binding port: $port - |System user: ${System.getProperty("user.name")} - | - |${buffer.mkString("\n")} - |========================================= - |End HiveThriftServer2Suite failure output - |========================================= - """.stripMargin, cause) - throw cause - } finally { - warehousePath.delete() - metastorePath.delete() - Process(stopScript, None, env: _*).run().exitValue() - // The `spark-daemon.sh' script uses kill, which is not synchronous, have to wait for a while. 
- Thread.sleep(3.seconds.toMillis) - Option(logTailingProcess).map(_.destroy()) - Option(logFilePath).map(new File(_).delete()) - } - } - - test("Test JDBC query execution") { - withJdbcStatement() { statement => - val queries = Seq( - "SET spark.sql.shuffle.partitions=3", - "DROP TABLE IF EXISTS test", - "CREATE TABLE test(key INT, val STRING)", - s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test", - "CACHE TABLE test") - - queries.foreach(statement.execute) - - assertResult(5, "Row count mismatch") { - val resultSet = statement.executeQuery("SELECT COUNT(*) FROM test") - resultSet.next() - resultSet.getInt(1) - } - } - } - - test("Test JDBC query execution in Http Mode") { - withJdbcStatement(httpMode = true) { statement => - val queries = Seq( - "SET spark.sql.shuffle.partitions=3", - "DROP TABLE IF EXISTS test", - "CREATE TABLE test(key INT, val STRING)", - s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test", - "CACHE TABLE test") - - queries.foreach(statement.execute) - - assertResult(5, "Row count mismatch") { - val resultSet = statement.executeQuery("SELECT COUNT(*) FROM test") - resultSet.next() - resultSet.getInt(1) - } - } - } - - test("SPARK-3004 regression: result set containing NULL") { - withJdbcStatement() { statement => - val queries = Seq( - "DROP TABLE IF EXISTS test_null", - "CREATE TABLE test_null(key INT, val STRING)", - s"LOAD DATA LOCAL INPATH '${TestData.smallKvWithNull}' OVERWRITE INTO TABLE test_null") - - queries.foreach(statement.execute) - - val resultSet = statement.executeQuery("SELECT * FROM test_null WHERE key IS NULL") - - (0 until 5).foreach { _ => - resultSet.next() - assert(resultSet.getInt(1) === 0) - assert(resultSet.wasNull()) - } - - assert(!resultSet.next()) - } - } - - test("GetInfo Thrift API") { - withCLIServiceClient() { client => - val user = System.getProperty("user.name") - val sessionHandle = client.openSession(user, "") - - assertResult("Spark SQL", "Wrong GetInfo(CLI_DBMS_NAME) result") { - client.getInfo(sessionHandle, GetInfoType.CLI_DBMS_NAME).getStringValue - } - - assertResult("Spark SQL", "Wrong GetInfo(CLI_SERVER_NAME) result") { - client.getInfo(sessionHandle, GetInfoType.CLI_SERVER_NAME).getStringValue - } - - assertResult(true, "Spark version shouldn't be \"Unknown\"") { - val version = client.getInfo(sessionHandle, GetInfoType.CLI_DBMS_VER).getStringValue - logInfo(s"Spark version: $version") - version != "Unknown" - } - } - } - - test("Checks Hive version") { - withJdbcStatement() { statement => - val resultSet = statement.executeQuery("SET spark.sql.hive.version") - resultSet.next() - assert(resultSet.getString(1) === s"spark.sql.hive.version=${HiveShim.version}") - } - } - - test("Checks Hive version in Http Mode") { - withJdbcStatement(httpMode = true) { statement => - val resultSet = statement.executeQuery("SET spark.sql.hive.version") - resultSet.next() - assert(resultSet.getString(1) === s"spark.sql.hive.version=${HiveShim.version}") - } - } - - test("SPARK-4292 regression: result set iterator issue") { - withJdbcStatement() { statement => - val queries = Seq( - "DROP TABLE IF EXISTS test_4292", - "CREATE TABLE test_4292(key INT, val STRING)", - s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_4292") - - queries.foreach(statement.execute) - - val resultSet = statement.executeQuery("SELECT key FROM test_4292") - - Seq(238, 86, 311, 27, 165).foreach { key => - resultSet.next() - assert(resultSet.getInt(1) === key) - } - - statement.executeQuery("DROP 
TABLE IF EXISTS test_4292") - } - } - - test("SPARK-4309 regression: Date type support") { - withJdbcStatement() { statement => - val queries = Seq( - "DROP TABLE IF EXISTS test_date", - "CREATE TABLE test_date(key INT, value STRING)", - s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_date") - - queries.foreach(statement.execute) - - assertResult(Date.valueOf("2011-01-01")) { - val resultSet = statement.executeQuery( - "SELECT CAST('2011-01-01' as date) FROM test_date LIMIT 1") - resultSet.next() - resultSet.getDate(1) - } - } - } - - test("SPARK-4407 regression: Complex type support") { - withJdbcStatement() { statement => - val queries = Seq( - "DROP TABLE IF EXISTS test_map", - "CREATE TABLE test_map(key INT, value STRING)", - s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_map") - - queries.foreach(statement.execute) - - assertResult("""{238:"val_238"}""") { - val resultSet = statement.executeQuery("SELECT MAP(key, value) FROM test_map LIMIT 1") - resultSet.next() - resultSet.getString(1) - } - - assertResult("""["238","val_238"]""") { - val resultSet = statement.executeQuery( - "SELECT ARRAY(CAST(key AS STRING), value) FROM test_map LIMIT 1") - resultSet.next() - resultSet.getString(1) - } - } - } -} diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala new file mode 100644 index 0000000000000..77ef37253e38f --- /dev/null +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -0,0 +1,403 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.thriftserver + +import java.io.File +import java.sql.{Date, DriverManager, Statement} + +import scala.collection.mutable.ArrayBuffer +import scala.concurrent.duration._ +import scala.concurrent.{Await, Promise} +import scala.sys.process.{Process, ProcessLogger} +import scala.util.{Random, Try} + +import org.apache.hadoop.hive.conf.HiveConf.ConfVars +import org.apache.hive.jdbc.HiveDriver +import org.apache.hive.service.auth.PlainSaslHelper +import org.apache.hive.service.cli.GetInfoType +import org.apache.hive.service.cli.thrift.TCLIService.Client +import org.apache.hive.service.cli.thrift.ThriftCLIServiceClient +import org.apache.thrift.protocol.TBinaryProtocol +import org.apache.thrift.transport.TSocket +import org.scalatest.{BeforeAndAfterAll, FunSuite} + +import org.apache.spark.Logging +import org.apache.spark.sql.catalyst.util +import org.apache.spark.sql.hive.HiveShim + +object TestData { + def getTestDataFilePath(name: String) = { + Thread.currentThread().getContextClassLoader.getResource(s"data/files/$name") + } + + val smallKv = getTestDataFilePath("small_kv.txt") + val smallKvWithNull = getTestDataFilePath("small_kv_with_null.txt") +} + +class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { + override def mode = ServerMode.binary + + private def withCLIServiceClient(f: ThriftCLIServiceClient => Unit): Unit = { + // Transport creation logics below mimics HiveConnection.createBinaryTransport + val rawTransport = new TSocket("localhost", serverPort) + val user = System.getProperty("user.name") + val transport = PlainSaslHelper.getPlainTransport(user, "anonymous", rawTransport) + val protocol = new TBinaryProtocol(transport) + val client = new ThriftCLIServiceClient(new Client(protocol)) + + transport.open() + try f(client) finally transport.close() + } + + test("GetInfo Thrift API") { + withCLIServiceClient { client => + val user = System.getProperty("user.name") + val sessionHandle = client.openSession(user, "") + + assertResult("Spark SQL", "Wrong GetInfo(CLI_DBMS_NAME) result") { + client.getInfo(sessionHandle, GetInfoType.CLI_DBMS_NAME).getStringValue + } + + assertResult("Spark SQL", "Wrong GetInfo(CLI_SERVER_NAME) result") { + client.getInfo(sessionHandle, GetInfoType.CLI_SERVER_NAME).getStringValue + } + + assertResult(true, "Spark version shouldn't be \"Unknown\"") { + val version = client.getInfo(sessionHandle, GetInfoType.CLI_DBMS_VER).getStringValue + logInfo(s"Spark version: $version") + version != "Unknown" + } + } + } + + test("JDBC query execution") { + withJdbcStatement { statement => + val queries = Seq( + "SET spark.sql.shuffle.partitions=3", + "DROP TABLE IF EXISTS test", + "CREATE TABLE test(key INT, val STRING)", + s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test", + "CACHE TABLE test") + + queries.foreach(statement.execute) + + assertResult(5, "Row count mismatch") { + val resultSet = statement.executeQuery("SELECT COUNT(*) FROM test") + resultSet.next() + resultSet.getInt(1) + } + } + } + + test("Checks Hive version") { + withJdbcStatement { statement => + val resultSet = statement.executeQuery("SET spark.sql.hive.version") + resultSet.next() + assert(resultSet.getString(1) === s"spark.sql.hive.version=${HiveShim.version}") + } + } + + test("SPARK-3004 regression: result set containing NULL") { + withJdbcStatement { statement => + val queries = Seq( + "DROP TABLE IF EXISTS test_null", + "CREATE TABLE test_null(key INT, val STRING)", + s"LOAD DATA LOCAL INPATH 
'${TestData.smallKvWithNull}' OVERWRITE INTO TABLE test_null") + + queries.foreach(statement.execute) + + val resultSet = statement.executeQuery("SELECT * FROM test_null WHERE key IS NULL") + + (0 until 5).foreach { _ => + resultSet.next() + assert(resultSet.getInt(1) === 0) + assert(resultSet.wasNull()) + } + + assert(!resultSet.next()) + } + } + + test("SPARK-4292 regression: result set iterator issue") { + withJdbcStatement { statement => + val queries = Seq( + "DROP TABLE IF EXISTS test_4292", + "CREATE TABLE test_4292(key INT, val STRING)", + s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_4292") + + queries.foreach(statement.execute) + + val resultSet = statement.executeQuery("SELECT key FROM test_4292") + + Seq(238, 86, 311, 27, 165).foreach { key => + resultSet.next() + assert(resultSet.getInt(1) === key) + } + + statement.executeQuery("DROP TABLE IF EXISTS test_4292") + } + } + + test("SPARK-4309 regression: Date type support") { + withJdbcStatement { statement => + val queries = Seq( + "DROP TABLE IF EXISTS test_date", + "CREATE TABLE test_date(key INT, value STRING)", + s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_date") + + queries.foreach(statement.execute) + + assertResult(Date.valueOf("2011-01-01")) { + val resultSet = statement.executeQuery( + "SELECT CAST('2011-01-01' as date) FROM test_date LIMIT 1") + resultSet.next() + resultSet.getDate(1) + } + } + } + + test("SPARK-4407 regression: Complex type support") { + withJdbcStatement { statement => + val queries = Seq( + "DROP TABLE IF EXISTS test_map", + "CREATE TABLE test_map(key INT, value STRING)", + s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_map") + + queries.foreach(statement.execute) + + assertResult("""{238:"val_238"}""") { + val resultSet = statement.executeQuery("SELECT MAP(key, value) FROM test_map LIMIT 1") + resultSet.next() + resultSet.getString(1) + } + + assertResult("""["238","val_238"]""") { + val resultSet = statement.executeQuery( + "SELECT ARRAY(CAST(key AS STRING), value) FROM test_map LIMIT 1") + resultSet.next() + resultSet.getString(1) + } + } + } +} + +class HiveThriftHttpServerSuite extends HiveThriftJdbcTest { + override def mode = ServerMode.http + + test("JDBC query execution") { + withJdbcStatement { statement => + val queries = Seq( + "SET spark.sql.shuffle.partitions=3", + "DROP TABLE IF EXISTS test", + "CREATE TABLE test(key INT, val STRING)", + s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test", + "CACHE TABLE test") + + queries.foreach(statement.execute) + + assertResult(5, "Row count mismatch") { + val resultSet = statement.executeQuery("SELECT COUNT(*) FROM test") + resultSet.next() + resultSet.getInt(1) + } + } + } + + test("Checks Hive version") { + withJdbcStatement { statement => + val resultSet = statement.executeQuery("SET spark.sql.hive.version") + resultSet.next() + assert(resultSet.getString(1) === s"spark.sql.hive.version=${HiveShim.version}") + } + } +} + +object ServerMode extends Enumeration { + val binary, http = Value +} + +abstract class HiveThriftJdbcTest extends HiveThriftServer2Test { + Class.forName(classOf[HiveDriver].getCanonicalName) + + private def jdbcUri = if (mode == ServerMode.http) { + s"""jdbc:hive2://localhost:$serverPort/ + |default? 
+ |hive.server2.transport.mode=http; + |hive.server2.thrift.http.path=cliservice + """.stripMargin.split("\n").mkString.trim + } else { + s"jdbc:hive2://localhost:$serverPort/" + } + + protected def withJdbcStatement(f: Statement => Unit): Unit = { + val connection = DriverManager.getConnection(jdbcUri, user, "") + val statement = connection.createStatement() + + try f(statement) finally { + statement.close() + connection.close() + } + } +} + +abstract class HiveThriftServer2Test extends FunSuite with BeforeAndAfterAll with Logging { + def mode: ServerMode.Value + + private val CLASS_NAME = HiveThriftServer2.getClass.getCanonicalName.stripSuffix("$") + private val LOG_FILE_MARK = s"starting $CLASS_NAME, logging to " + + private val startScript = "../../sbin/start-thriftserver.sh".split("/").mkString(File.separator) + private val stopScript = "../../sbin/stop-thriftserver.sh".split("/").mkString(File.separator) + + private var listeningPort: Int = _ + protected def serverPort: Int = listeningPort + + protected def user = System.getProperty("user.name") + + private var warehousePath: File = _ + private var metastorePath: File = _ + private def metastoreJdbcUri = s"jdbc:derby:;databaseName=$metastorePath;create=true" + + private var logPath: File = _ + private var logTailingProcess: Process = _ + private var diagnosisBuffer: ArrayBuffer[String] = ArrayBuffer.empty[String] + + private def serverStartCommand(port: Int) = { + val portConf = if (mode == ServerMode.binary) { + ConfVars.HIVE_SERVER2_THRIFT_PORT + } else { + ConfVars.HIVE_SERVER2_THRIFT_HTTP_PORT + } + + s"""$startScript + | --master local + | --hiveconf hive.root.logger=INFO,console + | --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$metastoreJdbcUri + | --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath + | --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST}=localhost + | --hiveconf ${ConfVars.HIVE_SERVER2_TRANSPORT_MODE}=$mode + | --hiveconf $portConf=$port + | --driver-class-path ${sys.props("java.class.path")} + | --conf spark.ui.enabled=false + """.stripMargin.split("\\s+").toSeq + } + + private def startThriftServer(port: Int, attempt: Int) = { + warehousePath = util.getTempFilePath("warehouse") + metastorePath = util.getTempFilePath("metastore") + logPath = null + logTailingProcess = null + + val command = serverStartCommand(port) + + diagnosisBuffer ++= + s""" + |### Attempt $attempt ### + |HiveThriftServer2 command line: $command + |Listening port: $port + |System user: $user + """.stripMargin.split("\n") + + logInfo(s"Trying to start HiveThriftServer2: port=$port, mode=$mode, attempt=$attempt") + + logPath = Process(command, None, "SPARK_TESTING" -> "0").lines.collectFirst { + case line if line.contains(LOG_FILE_MARK) => new File(line.drop(LOG_FILE_MARK.length)) + }.getOrElse { + throw new RuntimeException("Failed to find HiveThriftServer2 log file.") + } + + val serverStarted = Promise[Unit]() + + // Ensures that the following "tail" command won't fail. + logPath.createNewFile() + logTailingProcess = + // Using "-n +0" to make sure all lines in the log file are checked. + Process(s"/usr/bin/env tail -n +0 -f ${logPath.getCanonicalPath}").run(ProcessLogger( + (line: String) => { + diagnosisBuffer += line + + if (line.contains("ThriftBinaryCLIService listening on") || + line.contains("Started ThriftHttpCLIService in http")) { + serverStarted.trySuccess(()) + } else if (line.contains("HiveServer2 is stopped")) { + // This log line appears when the server fails to start and terminates gracefully (e.g. 
+ // because of port contention). + serverStarted.tryFailure(new RuntimeException("Failed to start HiveThriftServer2")) + } + })) + + Await.result(serverStarted.future, 2.minute) + } + + private def stopThriftServer(): Unit = { + // The `spark-daemon.sh' script uses kill, which is not synchronous, have to wait for a while. + Process(stopScript, None).run().exitValue() + Thread.sleep(3.seconds.toMillis) + + warehousePath.delete() + warehousePath = null + + metastorePath.delete() + metastorePath = null + + Option(logPath).foreach(_.delete()) + logPath = null + + Option(logTailingProcess).foreach(_.destroy()) + logTailingProcess = null + } + + private def dumpLogs(): Unit = { + logError( + s""" + |===================================== + |HiveThriftServer2Suite failure output + |===================================== + |${diagnosisBuffer.mkString("\n")} + |========================================= + |End HiveThriftServer2Suite failure output + |========================================= + """.stripMargin) + } + + override protected def beforeAll(): Unit = { + // Chooses a random port between 10000 and 19999 + listeningPort = 10000 + Random.nextInt(10000) + diagnosisBuffer.clear() + + // Retries up to 3 times with different port numbers if the server fails to start + (1 to 3).foldLeft(Try(startThriftServer(listeningPort, 0))) { case (started, attempt) => + started.orElse { + listeningPort += 1 + stopThriftServer() + Try(startThriftServer(listeningPort, attempt)) + } + }.recover { + case cause: Throwable => + dumpLogs() + throw cause + }.get + + logInfo(s"HiveThriftServer2 started successfully") + } + + override protected def afterAll(): Unit = { + stopThriftServer() + logInfo("HiveThriftServer2 stopped") + } +} From 53a1ebf33b5c349ae3a40d7eebf357b839b363af Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 24 Feb 2015 18:51:41 -0800 Subject: [PATCH 294/357] [SPARK-5904][SQL] DataFrame Java API test suites. Added a new test suite to make sure Java DF programs can use varargs properly. Also moved all suites into test.org.apache.spark package to make sure the suites also test for method visibility. Author: Reynold Xin Closes #4751 from rxin/df-tests and squashes the following commits: 1e8b8e4 [Reynold Xin] Fixed imports and renamed JavaAPISuite. a6ca53b [Reynold Xin] [SPARK-5904][SQL] DataFrame Java API test suites. 
--- .../apache/spark/sql/api/java/JavaDsl.java | 120 ------------------ .../spark/sql}/JavaApplySchemaSuite.java | 28 ++-- .../apache/spark/sql/JavaDataFrameSuite.java | 84 ++++++++++++ .../org/apache/spark/sql}/JavaRowSuite.java | 2 +- .../org/apache/spark/sql/JavaUDFSuite.java} | 10 +- .../spark/sql/sources/JavaSaveLoadSuite.java | 3 +- .../org/apache/spark/sql/DataFrameSuite.scala | 4 +- 7 files changed, 108 insertions(+), 143 deletions(-) delete mode 100644 sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java rename sql/core/src/test/java/{org/apache/spark/sql/api/java => test/org/apache/spark/sql}/JavaApplySchemaSuite.java (90%) create mode 100644 sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java rename sql/core/src/test/java/{org/apache/spark/sql/api/java => test/org/apache/spark/sql}/JavaRowSuite.java (99%) rename sql/core/src/test/java/{org/apache/spark/sql/api/java/JavaAPISuite.java => test/org/apache/spark/sql/JavaUDFSuite.java} (94%) rename sql/core/src/test/java/{ => test}/org/apache/spark/sql/sources/JavaSaveLoadSuite.java (98%) diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java deleted file mode 100644 index 05233dc5ffc58..0000000000000 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaDsl.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.api.java; - -import com.google.common.collect.ImmutableMap; - -import org.apache.spark.sql.Column; -import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.types.DataTypes; - -import static org.apache.spark.sql.functions.*; - -/** - * This test doesn't actually run anything. It is here to check the API compatibility for Java. 
- */ -public class JavaDsl { - - public static void testDataFrame(final DataFrame df) { - DataFrame df1 = df.select("colA"); - df1 = df.select("colA", "colB"); - - df1 = df.select(col("colA"), col("colB"), lit("literal value").$plus(1)); - - df1 = df.filter(col("colA")); - - java.util.Map aggExprs = ImmutableMap.builder() - .put("colA", "sum") - .put("colB", "avg") - .build(); - - df1 = df.agg(aggExprs); - - df1 = df.groupBy("groupCol").agg(aggExprs); - - df1 = df.join(df1, col("key1").$eq$eq$eq(col("key2")), "outer"); - - df.orderBy("colA"); - df.orderBy("colA", "colB", "colC"); - df.orderBy(col("colA").desc()); - df.orderBy(col("colA").desc(), col("colB").asc()); - - df.sort("colA"); - df.sort("colA", "colB", "colC"); - df.sort(col("colA").desc()); - df.sort(col("colA").desc(), col("colB").asc()); - - df.as("b"); - - df.limit(5); - - df.unionAll(df1); - df.intersect(df1); - df.except(df1); - - df.sample(true, 0.1, 234); - - df.head(); - df.head(5); - df.first(); - df.count(); - } - - public static void testColumn(final Column c) { - c.asc(); - c.desc(); - - c.endsWith("abcd"); - c.startsWith("afgasdf"); - - c.like("asdf%"); - c.rlike("wef%asdf"); - - c.as("newcol"); - - c.cast("int"); - c.cast(DataTypes.IntegerType); - } - - public static void testDsl() { - // Creating a column. - Column c = col("abcd"); - Column c1 = column("abcd"); - - // Literals - Column l1 = lit(1); - Column l2 = lit(1.0); - Column l3 = lit("abcd"); - - // Functions - Column a = upper(c); - a = lower(c); - a = sqrt(c); - a = abs(c); - - // Aggregates - a = min(c); - a = max(c); - a = sum(c); - a = sumDistinct(c); - a = countDistinct(c, a); - a = avg(c); - a = first(c); - a = last(c); - } -} diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java similarity index 90% rename from sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java rename to sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java index 643b891ab1b63..c344a9b095c52 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaApplySchemaSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaApplySchemaSuite.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.api.java; +package test.org.apache.spark.sql; import java.io.Serializable; import java.util.ArrayList; @@ -39,18 +39,18 @@ // see http://stackoverflow.com/questions/758570/. 
public class JavaApplySchemaSuite implements Serializable { private transient JavaSparkContext javaCtx; - private transient SQLContext javaSqlCtx; + private transient SQLContext sqlContext; @Before public void setUp() { - javaSqlCtx = TestSQLContext$.MODULE$; - javaCtx = new JavaSparkContext(javaSqlCtx.sparkContext()); + sqlContext = TestSQLContext$.MODULE$; + javaCtx = new JavaSparkContext(sqlContext.sparkContext()); } @After public void tearDown() { javaCtx = null; - javaSqlCtx = null; + sqlContext = null; } public static class Person implements Serializable { @@ -98,9 +98,9 @@ public Row call(Person person) throws Exception { fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false)); StructType schema = DataTypes.createStructType(fields); - DataFrame df = javaSqlCtx.applySchema(rowRDD, schema); + DataFrame df = sqlContext.applySchema(rowRDD, schema); df.registerTempTable("people"); - Row[] actual = javaSqlCtx.sql("SELECT * FROM people").collect(); + Row[] actual = sqlContext.sql("SELECT * FROM people").collect(); List expected = new ArrayList(2); expected.add(RowFactory.create("Michael", 29)); @@ -109,8 +109,6 @@ public Row call(Person person) throws Exception { Assert.assertEquals(expected, Arrays.asList(actual)); } - - @Test public void dataFrameRDDOperations() { List personList = new ArrayList(2); @@ -135,9 +133,9 @@ public Row call(Person person) throws Exception { fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false)); StructType schema = DataTypes.createStructType(fields); - DataFrame df = javaSqlCtx.applySchema(rowRDD, schema); + DataFrame df = sqlContext.applySchema(rowRDD, schema); df.registerTempTable("people"); - List actual = javaSqlCtx.sql("SELECT * FROM people").toJavaRDD().map(new Function() { + List actual = sqlContext.sql("SELECT * FROM people").toJavaRDD().map(new Function() { public String call(Row row) { return row.getString(0) + "_" + row.get(1).toString(); @@ -189,18 +187,18 @@ public void applySchemaToJSON() { null, "this is another simple string.")); - DataFrame df1 = javaSqlCtx.jsonRDD(jsonRDD); + DataFrame df1 = sqlContext.jsonRDD(jsonRDD); StructType actualSchema1 = df1.schema(); Assert.assertEquals(expectedSchema, actualSchema1); df1.registerTempTable("jsonTable1"); - List actual1 = javaSqlCtx.sql("select * from jsonTable1").collectAsList(); + List actual1 = sqlContext.sql("select * from jsonTable1").collectAsList(); Assert.assertEquals(expectedResult, actual1); - DataFrame df2 = javaSqlCtx.jsonRDD(jsonRDD, expectedSchema); + DataFrame df2 = sqlContext.jsonRDD(jsonRDD, expectedSchema); StructType actualSchema2 = df2.schema(); Assert.assertEquals(expectedSchema, actualSchema2); df2.registerTempTable("jsonTable2"); - List actual2 = javaSqlCtx.sql("select * from jsonTable2").collectAsList(); + List actual2 = sqlContext.sql("select * from jsonTable2").collectAsList(); Assert.assertEquals(expectedResult, actual2); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java new file mode 100644 index 0000000000000..c1c51f80d6586 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package test.org.apache.spark.sql; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.sql.*; +import org.apache.spark.sql.test.TestSQLContext$; +import static org.apache.spark.sql.functions.*; + + +public class JavaDataFrameSuite { + private transient SQLContext context; + + @Before + public void setUp() { + // Trigger static initializer of TestData + TestData$.MODULE$.testData(); + context = TestSQLContext$.MODULE$; + } + + @After + public void tearDown() { + context = null; + } + + @Test + public void testExecution() { + DataFrame df = context.table("testData").filter("key = 1"); + Assert.assertEquals(df.select("key").collect()[0].get(0), 1); + } + + /** + * See SPARK-5904. Abstract vararg methods defined in Scala do not work in Java. + */ + @Test + public void testVarargMethods() { + DataFrame df = context.table("testData"); + + df.toDF("key1", "value1"); + + df.select("key", "value"); + df.select(col("key"), col("value")); + df.selectExpr("key", "value + 1"); + + df.sort("key", "value"); + df.sort(col("key"), col("value")); + df.orderBy("key", "value"); + df.orderBy(col("key"), col("value")); + + df.groupBy("key", "value").agg(col("key"), col("value"), sum("value")); + df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value")); + df.agg(first("key"), sum("value")); + + df.groupBy().avg("key"); + df.groupBy().mean("key"); + df.groupBy().max("key"); + df.groupBy().min("key"); + df.groupBy().sum("key"); + + // Varargs in column expressions + df.groupBy().agg(countDistinct("key", "value")); + df.groupBy().agg(countDistinct(col("key"), col("value"))); + df.select(coalesce(col("key"))); + } +} diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaRowSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java similarity index 99% rename from sql/core/src/test/java/org/apache/spark/sql/api/java/JavaRowSuite.java rename to sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java index fbfcd3f59d910..4ce1d1dddb26a 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaRowSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaRowSuite.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.api.java; +package test.org.apache.spark.sql; import java.math.BigDecimal; import java.sql.Date; diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java similarity index 94% rename from sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java rename to sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java index a21a15409080c..79d92734ff375 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaAPISuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaUDFSuite.java @@ -15,24 +15,26 @@ * limitations under the License. */ -package org.apache.spark.sql.api.java; +package test.org.apache.spark.sql; import java.io.Serializable; -import org.apache.spark.sql.test.TestSQLContext$; import org.junit.After; import org.junit.Before; import org.junit.Test; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.api.java.UDF1; +import org.apache.spark.sql.api.java.UDF2; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.test.TestSQLContext$; import org.apache.spark.sql.types.DataTypes; // The test suite itself is Serializable so that anonymous Function implementations can be // serialized, as an alternative to converting these anonymous classes to static inner classes; // see http://stackoverflow.com/questions/758570/. -public class JavaAPISuite implements Serializable { +public class JavaUDFSuite implements Serializable { private transient JavaSparkContext sc; private transient SQLContext sqlContext; diff --git a/sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java similarity index 98% rename from sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java rename to sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java index 311f1bdd07510..b76f7d421f643 100644 --- a/sql/core/src/test/java/org/apache/spark/sql/sources/JavaSaveLoadSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/JavaSaveLoadSuite.java @@ -14,7 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.sql.sources; + +package test.org.apache.spark.sql.sources; import java.io.File; import java.io.IOException; diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index e71e9bee3a6d8..30e77e4ef30f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -411,7 +411,7 @@ class DataFrameSuite extends QueryTest { ) } - test("addColumn") { + test("withColumn") { val df = testData.toDF().withColumn("newCol", col("key") + 1) checkAnswer( df, @@ -421,7 +421,7 @@ class DataFrameSuite extends QueryTest { assert(df.schema.map(_.name).toSeq === Seq("key", "value", "newCol")) } - test("renameColumn") { + test("withColumnRenamed") { val df = testData.toDF().withColumn("newCol", col("key") + 1) .withColumnRenamed("value", "valueRenamed") checkAnswer( From fba11c2f55dd81e4f6230e7edca3c7b2e01ccd9d Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 24 Feb 2015 18:59:23 -0800 Subject: [PATCH 295/357] [SPARK-5985][SQL] DataFrame sortBy -> orderBy in Python. Also added desc/asc function for constructing sorting expressions more conveniently. And added a small fix to lift alias out of cast expression. Author: Reynold Xin Closes #4752 from rxin/SPARK-5985 and squashes the following commits: aeda5ae [Reynold Xin] Added Experimental flag to ColumnName. 047ad03 [Reynold Xin] Lift alias out of cast. c9cf17c [Reynold Xin] [SPARK-5985][SQL] DataFrame sortBy -> orderBy in Python. --- python/pyspark/sql/dataframe.py | 11 +++++-- python/pyspark/sql/functions.py | 3 ++ .../scala/org/apache/spark/sql/Column.scala | 13 +++++++-- .../org/apache/spark/sql/functions.scala | 29 +++++++++++++++++++ .../spark/sql/ColumnExpressionSuite.scala | 4 +++ .../org/apache/spark/sql/DataFrameSuite.scala | 4 +++ 6 files changed, 59 insertions(+), 5 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 010c38f93b9cf..6f746d136b22d 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -504,13 +504,18 @@ def join(self, other, joinExprs=None, joinType=None): return DataFrame(jdf, self.sql_ctx) def sort(self, *cols): - """ Return a new :class:`DataFrame` sorted by the specified column. + """ Return a new :class:`DataFrame` sorted by the specified column(s). :param cols: The columns or expressions used for sorting >>> df.sort(df.age.desc()).collect() [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] - >>> df.sortBy(df.age.desc()).collect() + >>> df.orderBy(df.age.desc()).collect() + [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] + >>> from pyspark.sql.functions import * + >>> df.sort(asc("age")).collect() + [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] + >>> df.orderBy(desc("age"), "name").collect() [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')] """ if not cols: @@ -520,7 +525,7 @@ def sort(self, *cols): jdf = self._jdf.sort(self._sc._jvm.PythonUtils.toSeq(jcols)) return DataFrame(jdf, self.sql_ctx) - sortBy = sort + orderBy = sort def head(self, n=None): """ Return the first `n` rows or the first row if n is None. 
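Between the Python and Scala halves of this commit, a brief editorial Scala sketch of the resulting ordering API may help; it is not part of the patch and assumes a `DataFrame` with `dept` and `age` columns. The new `asc`/`desc` helpers added in the `functions.scala` hunk below make sort expressions available by column name, mirroring the Python `asc("age")`/`desc("age")` usage above.

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{asc, col, desc}

// Sort by dept ascending, then age descending, using the new helper functions.
def ordered(df: DataFrame): DataFrame = df.orderBy(asc("dept"), desc("age"))

// Equivalent Column-based spelling; orderBy is an alias for sort.
def orderedViaColumns(df: DataFrame): DataFrame = df.sort(col("dept").asc, col("age").desc)
```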
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index fc61162f0b827..8aa44765205c1 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -48,6 +48,9 @@ def _(col): 'lit': 'Creates a :class:`Column` of literal value.', 'col': 'Returns a :class:`Column` based on the given column name.', 'column': 'Returns a :class:`Column` based on the given column name.', + 'asc': 'Returns a sort expression based on the ascending order of the given column name.', + 'desc': 'Returns a sort expression based on the descending order of the given column name.', + 'upper': 'Converts a string expression to upper case.', 'lower': 'Converts a string expression to upper case.', 'sqrt': 'Computes the square root of the specified float value.', diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 980754322e6c8..a2cc9a9b93eb8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -600,7 +600,11 @@ class Column(protected[sql] val expr: Expression) { * * @group expr_ops */ - def cast(to: DataType): Column = Cast(expr, to) + def cast(to: DataType): Column = expr match { + // Lift alias out of cast so we can support col.as("name").cast(IntegerType) + case Alias(childExpr, name) => Alias(Cast(childExpr, to), name)() + case _ => Cast(expr, to) + } /** * Casts the column to a different data type, using the canonical string representation @@ -613,7 +617,7 @@ class Column(protected[sql] val expr: Expression) { * * @group expr_ops */ - def cast(to: String): Column = Cast(expr, to.toLowerCase match { + def cast(to: String): Column = cast(to.toLowerCase match { case "string" | "str" => StringType case "boolean" => BooleanType case "byte" => ByteType @@ -671,6 +675,11 @@ class Column(protected[sql] val expr: Expression) { } +/** + * :: Experimental :: + * A convenient class used for constructing schema. + */ +@Experimental class ColumnName(name: String) extends Column(name) { /** Creates a new AttributeReference of type boolean */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 2a1e086891423..4fdbfc6d22c9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.types._ * * @groupname udf_funcs UDF functions * @groupname agg_funcs Aggregate functions + * @groupname sort_funcs Sorting functions * @groupname normal_funcs Non-aggregate functions * @groupname Ungrouped Support functions for DataFrames. */ @@ -96,6 +97,33 @@ object functions { } ////////////////////////////////////////////////////////////////////////////////////////////// + // Sort functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Returns a sort expression based on ascending order of the column. + * {{ + * // Sort by dept in ascending order, and then age in descending order. + * df.sort(asc("dept"), desc("age")) + * }} + * + * @group sort_funcs + */ + def asc(columnName: String): Column = Column(columnName).asc + + /** + * Returns a sort expression based on the descending order of the column. + * {{ + * // Sort by dept in ascending order, and then age in descending order. 
+ * df.sort(asc("dept"), desc("age")) + * }} + * + * @group sort_funcs + */ + def desc(columnName: String): Column = Column(columnName).desc + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Aggregate functions ////////////////////////////////////////////////////////////////////////////////////////////// /** @@ -263,6 +291,7 @@ object functions { def max(columnName: String): Column = max(Column(columnName)) ////////////////////////////////////////////////////////////////////////////////////////////// + // Non-aggregate functions ////////////////////////////////////////////////////////////////////////////////////////////// /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index 928b0deb61921..37c02aaa5460b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -309,4 +309,8 @@ class ColumnExpressionSuite extends QueryTest { (1 to 100).map(n => Row(null)) ) } + + test("lift alias out of cast") { + assert(col("1234").as("name").cast("int").expr === col("1234").cast("int").as("name").expr) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 30e77e4ef30f2..c392a553c03f3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -239,6 +239,10 @@ class DataFrameSuite extends QueryTest { testData2.orderBy('a.asc, 'b.asc), Seq(Row(1,1), Row(1,2), Row(2,1), Row(2,2), Row(3,1), Row(3,2))) + checkAnswer( + testData2.orderBy(asc("a"), desc("b")), + Seq(Row(1,2), Row(1,1), Row(2,2), Row(2,1), Row(3,2), Row(3,1))) + checkAnswer( testData2.orderBy('a.asc, 'b.desc), Seq(Row(1,2), Row(1,1), Row(2,2), Row(2,1), Row(3,2), Row(3,1))) From 922b43b3cc1cca04e0313bf9e31c5f944ac06d1f Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 24 Feb 2015 19:10:37 -0800 Subject: [PATCH 296/357] [SPARK-5993][Streaming][Build] Fix assembly jar location of kafka-assembly Published Kafka-assembly JAR was empty in 1.3.0-RC1 This is because the maven build generated two Jars- 1. an empty JAR file (since kafka-assembly has no code of its own) 2. a assembly JAR file containing everything in a different location as 1 The maven publishing plugin uploaded 1 and not 2. Instead if 2 is not configure to generate in a different location, there is only 1 jar containing everything, which gets published. 
Author: Tathagata Das Closes #4753 from tdas/SPARK-5993 and squashes the following commits: c390db8 [Tathagata Das] Fix assembly jar location of kafka-assembly --- external/kafka-assembly/pom.xml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 503fc129dc4f2..8daa7ed608f6a 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -33,9 +33,6 @@ streaming-kafka-assembly - scala-${scala.binary.version} - spark-streaming-kafka-assembly-${project.version}.jar - ${project.build.directory}/${spark.jar.dir}/${spark.jar.basename} @@ -61,7 +58,6 @@ maven-shade-plugin false - ${spark.jar} *:* From 769e092bdc51582372093f76dbaece27149cc4ea Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 24 Feb 2015 19:51:36 -0800 Subject: [PATCH 297/357] [SPARK-5286][SQL] SPARK-5286 followup https://issues.apache.org/jira/browse/SPARK-5286 Author: Yin Huai Closes #4755 from yhuai/SPARK-5286-throwable and squashes the following commits: 4c0c450 [Yin Huai] Catch Throwable instead of Exception. --- .../org/apache/spark/sql/hive/execution/commands.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index c88d0e6b79491..9934a5d3c30a2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -63,10 +63,10 @@ case class DropTable( } catch { // This table's metadata is not in case _: org.apache.hadoop.hive.ql.metadata.InvalidTableException => - // Other exceptions can be caused by users providing wrong parameters in OPTIONS + // Other Throwables can be caused by users providing wrong parameters in OPTIONS // (e.g. invalid paths). We catch it and log a warning message. - // Users should be able to drop such kinds of tables regardless if there is an exception. - case e: Exception => log.warn(s"${e.getMessage}") + // Users should be able to drop such kinds of tables regardless if there is an error. + case e: Throwable => log.warn(s"${e.getMessage}") } hiveContext.invalidateTable(tableName) hiveContext.runSqlHive(s"DROP TABLE $ifExistsClause$tableName") From d641fbb39c90b1d734cc55396ca43d7e98788975 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 24 Feb 2015 20:51:55 -0800 Subject: [PATCH 298/357] [SPARK-5994] [SQL] Python DataFrame documentation fixes select empty should NOT be the same as select. make sure selectExpr is behaving the same. join param documentation link to source doesn't work in jekyll generated file cross reference of columns (i.e. 
enabling linking) show(): move df example before df.show() move tests in SQLContext out of docstring otherwise doc is too long Column.desc and .asc doesn't have any documentation in documentation, sort functions.*) Author: Davies Liu Closes #4756 from davies/df_docs and squashes the following commits: f30502c [Davies Liu] fix doc 32f0d46 [Davies Liu] fix DataFrame docs --- docs/_config.yml | 1 + python/docs/pyspark.sql.rst | 3 - python/pyspark/sql/context.py | 182 ++++++-------------------------- python/pyspark/sql/dataframe.py | 56 +++++----- python/pyspark/sql/functions.py | 1 + python/pyspark/sql/tests.py | 68 +++++++++++- python/pyspark/sql/types.py | 2 +- 7 files changed, 130 insertions(+), 183 deletions(-) diff --git a/docs/_config.yml b/docs/_config.yml index e2db274e1f619..0652927a8ce9b 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -10,6 +10,7 @@ kramdown: include: - _static + - _modules # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. diff --git a/python/docs/pyspark.sql.rst b/python/docs/pyspark.sql.rst index e03379e521a07..2e3f69b9a562a 100644 --- a/python/docs/pyspark.sql.rst +++ b/python/docs/pyspark.sql.rst @@ -7,7 +7,6 @@ Module Context .. automodule:: pyspark.sql :members: :undoc-members: - :show-inheritance: pyspark.sql.types module @@ -15,7 +14,6 @@ pyspark.sql.types module .. automodule:: pyspark.sql.types :members: :undoc-members: - :show-inheritance: pyspark.sql.functions module @@ -23,4 +21,3 @@ pyspark.sql.functions module .. automodule:: pyspark.sql.functions :members: :undoc-members: - :show-inheritance: diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 125933c9d3ae0..5d7aeb664cadf 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -129,6 +129,7 @@ def registerFunction(self, name, f, returnType=StringType()): >>> sqlCtx.registerFunction("stringLengthString", lambda x: len(x)) >>> sqlCtx.sql("SELECT stringLengthString('test')").collect() [Row(c0=u'4')] + >>> from pyspark.sql.types import IntegerType >>> sqlCtx.registerFunction("stringLengthInt", lambda x: len(x), IntegerType()) >>> sqlCtx.sql("SELECT stringLengthInt('test')").collect() @@ -197,31 +198,6 @@ def inferSchema(self, rdd, samplingRatio=None): >>> df = sqlCtx.inferSchema(rdd) >>> df.collect()[0] Row(field1=1, field2=u'row1') - - >>> NestedRow = Row("f1", "f2") - >>> nestedRdd1 = sc.parallelize([ - ... NestedRow(array('i', [1, 2]), {"row1": 1.0}), - ... NestedRow(array('i', [2, 3]), {"row2": 2.0})]) - >>> df = sqlCtx.inferSchema(nestedRdd1) - >>> df.collect() - [Row(f1=[1, 2], f2={u'row1': 1.0}), ..., f2={u'row2': 2.0})] - - >>> nestedRdd2 = sc.parallelize([ - ... NestedRow([[1, 2], [2, 3]], [1, 2]), - ... NestedRow([[2, 3], [3, 4]], [2, 3])]) - >>> df = sqlCtx.inferSchema(nestedRdd2) - >>> df.collect() - [Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), ..., f2=[2, 3])] - - >>> from collections import namedtuple - >>> CustomRow = namedtuple('CustomRow', 'field1 field2') - >>> rdd = sc.parallelize( - ... [CustomRow(field1=1, field2="row1"), - ... CustomRow(field1=2, field2="row2"), - ... CustomRow(field1=3, field2="row3")]) - >>> df = sqlCtx.inferSchema(rdd) - >>> df.collect()[0] - Row(field1=1, field2=u'row1') """ if isinstance(rdd, DataFrame): @@ -252,56 +228,8 @@ def applySchema(self, rdd, schema): >>> schema = StructType([StructField("field1", IntegerType(), False), ... 
StructField("field2", StringType(), False)]) >>> df = sqlCtx.applySchema(rdd2, schema) - >>> sqlCtx.registerDataFrameAsTable(df, "table1") - >>> df2 = sqlCtx.sql("SELECT * from table1") - >>> df2.collect() - [Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')] - - >>> from datetime import date, datetime - >>> rdd = sc.parallelize([(127, -128L, -32768, 32767, 2147483647L, 1.0, - ... date(2010, 1, 1), - ... datetime(2010, 1, 1, 1, 1, 1), - ... {"a": 1}, (2,), [1, 2, 3], None)]) - >>> schema = StructType([ - ... StructField("byte1", ByteType(), False), - ... StructField("byte2", ByteType(), False), - ... StructField("short1", ShortType(), False), - ... StructField("short2", ShortType(), False), - ... StructField("int1", IntegerType(), False), - ... StructField("float1", FloatType(), False), - ... StructField("date1", DateType(), False), - ... StructField("time1", TimestampType(), False), - ... StructField("map1", - ... MapType(StringType(), IntegerType(), False), False), - ... StructField("struct1", - ... StructType([StructField("b", ShortType(), False)]), False), - ... StructField("list1", ArrayType(ByteType(), False), False), - ... StructField("null1", DoubleType(), True)]) - >>> df = sqlCtx.applySchema(rdd, schema) - >>> results = df.map( - ... lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int1, x.float1, x.date1, - ... x.time1, x.map1["a"], x.struct1.b, x.list1, x.null1)) - >>> results.collect()[0] # doctest: +NORMALIZE_WHITESPACE - (127, -128, -32768, 32767, 2147483647, 1.0, datetime.date(2010, 1, 1), - datetime.datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None) - - >>> df.registerTempTable("table2") - >>> sqlCtx.sql( - ... "SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " + - ... "short1 + 1 AS short1, short2 - 1 AS short2, int1 - 1 AS int1, " + - ... "float1 + 1.5 as float1 FROM table2").collect() - [Row(byte1=126, byte2=-127, short1=-32767, short2=32766, int1=2147483646, float1=2.5)] - - >>> from pyspark.sql.types import _parse_schema_abstract, _infer_schema_type - >>> rdd = sc.parallelize([(127, -32768, 1.0, - ... datetime(2010, 1, 1, 1, 1, 1), - ... {"a": 1}, (2,), [1, 2, 3])]) - >>> abstract = "byte1 short1 float1 time1 map1{} struct1(b) list1[]" - >>> schema = _parse_schema_abstract(abstract) - >>> typedSchema = _infer_schema_type(rdd.first(), schema) - >>> df = sqlCtx.applySchema(rdd, typedSchema) >>> df.collect() - [Row(byte1=127, short1=-32768, float1=1.0, time1=..., list1=[1, 2, 3])] + [Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')] """ if isinstance(rdd, DataFrame): @@ -459,46 +387,28 @@ def jsonFile(self, path, schema=None, samplingRatio=1.0): >>> import tempfile, shutil >>> jsonFile = tempfile.mkdtemp() >>> shutil.rmtree(jsonFile) - >>> ofn = open(jsonFile, 'w') - >>> for json in jsonStrings: - ... print>>ofn, json - >>> ofn.close() + >>> with open(jsonFile, 'w') as f: + ... f.writelines(jsonStrings) >>> df1 = sqlCtx.jsonFile(jsonFile) - >>> sqlCtx.registerDataFrameAsTable(df1, "table1") - >>> df2 = sqlCtx.sql( - ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " - ... "field6 as f4 from table1") - >>> for r in df2.collect(): - ... print r - Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) - Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) - Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) - - >>> df3 = sqlCtx.jsonFile(jsonFile, df1.schema) - >>> sqlCtx.registerDataFrameAsTable(df3, "table2") - >>> df4 = sqlCtx.sql( - ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " - ... 
"field6 as f4 from table2") - >>> for r in df4.collect(): - ... print r - Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) - Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) - Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) + >>> df1.printSchema() + root + |-- field1: long (nullable = true) + |-- field2: string (nullable = true) + |-- field3: struct (nullable = true) + | |-- field4: long (nullable = true) >>> from pyspark.sql.types import * >>> schema = StructType([ - ... StructField("field2", StringType(), True), + ... StructField("field2", StringType()), ... StructField("field3", - ... StructType([ - ... StructField("field5", - ... ArrayType(IntegerType(), False), True)]), False)]) - >>> df5 = sqlCtx.jsonFile(jsonFile, schema) - >>> sqlCtx.registerDataFrameAsTable(df5, "table3") - >>> df6 = sqlCtx.sql( - ... "SELECT field2 AS f1, field3.field5 as f2, " - ... "field3.field5[0] as f3 from table3") - >>> df6.collect() - [Row(f1=u'row1', f2=None, f3=None)...Row(f1=u'row3', f2=[], f3=None)] + ... StructType([StructField("field5", ArrayType(IntegerType()))]))]) + >>> df2 = sqlCtx.jsonFile(jsonFile, schema) + >>> df2.printSchema() + root + |-- field2: string (nullable = true) + |-- field3: struct (nullable = true) + | |-- field5: array (nullable = true) + | | |-- element: integer (containsNull = true) """ if schema is None: df = self._ssql_ctx.jsonFile(path, samplingRatio) @@ -517,48 +427,23 @@ def jsonRDD(self, rdd, schema=None, samplingRatio=1.0): determine the schema. >>> df1 = sqlCtx.jsonRDD(json) - >>> sqlCtx.registerDataFrameAsTable(df1, "table1") - >>> df2 = sqlCtx.sql( - ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " - ... "field6 as f4 from table1") - >>> for r in df2.collect(): - ... print r - Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) - Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) - Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) - - >>> df3 = sqlCtx.jsonRDD(json, df1.schema) - >>> sqlCtx.registerDataFrameAsTable(df3, "table2") - >>> df4 = sqlCtx.sql( - ... "SELECT field1 AS f1, field2 as f2, field3 as f3, " - ... "field6 as f4 from table2") - >>> for r in df4.collect(): - ... print r - Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) - Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) - Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) + >>> df1.first() + Row(field1=1, field2=u'row1', field3=Row(field4=11, field5=None), field6=None) + + >>> df2 = sqlCtx.jsonRDD(json, df1.schema) + >>> df2.first() + Row(field1=1, field2=u'row1', field3=Row(field4=11, field5=None), field6=None) >>> from pyspark.sql.types import * >>> schema = StructType([ - ... StructField("field2", StringType(), True), + ... StructField("field2", StringType()), ... StructField("field3", - ... StructType([ - ... StructField("field5", - ... ArrayType(IntegerType(), False), True)]), False)]) - >>> df5 = sqlCtx.jsonRDD(json, schema) - >>> sqlCtx.registerDataFrameAsTable(df5, "table3") - >>> df6 = sqlCtx.sql( - ... "SELECT field2 AS f1, field3.field5 as f2, " - ... "field3.field5[0] as f3 from table3") - >>> df6.collect() - [Row(f1=u'row1', f2=None,...Row(f1=u'row3', f2=[], f3=None)] - - >>> sqlCtx.jsonRDD(sc.parallelize(['{}', - ... '{"key0": {"key1": "value1"}}'])).collect() - [Row(key0=None), Row(key0=Row(key1=u'value1'))] - >>> sqlCtx.jsonRDD(sc.parallelize(['{"key0": null}', - ... 
'{"key0": {"key1": "value1"}}'])).collect() - [Row(key0=None), Row(key0=Row(key1=u'value1'))] + ... StructType([StructField("field5", ArrayType(IntegerType()))])) + ... ]) + >>> df3 = sqlCtx.jsonRDD(json, schema) + >>> df3.first() + Row(field2=u'row1', field3=Row(field5=None)) + """ def func(iterator): @@ -848,7 +733,8 @@ def _test(): globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) (failure_count, test_count) = doctest.testmod( - pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS) + pyspark.sql.context, globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) globs['sc'].stop() if failure_count: exit(-1) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 6f746d136b22d..6d42410020b64 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -96,7 +96,7 @@ def applySchema(it): return self._lazy_rdd def toJSON(self, use_unicode=False): - """Convert a DataFrame into a MappedRDD of JSON documents; one document per row. + """Convert a :class:`DataFrame` into a MappedRDD of JSON documents; one document per row. >>> df.toJSON().first() '{"age":2,"name":"Alice"}' @@ -108,7 +108,7 @@ def saveAsParquetFile(self, path): """Save the contents as a Parquet file, preserving the schema. Files that are written out using this method can be read back in as - a DataFrame using the L{SQLContext.parquetFile} method. + a :class:`DataFrame` using the L{SQLContext.parquetFile} method. >>> import tempfile, shutil >>> parquetFile = tempfile.mkdtemp() @@ -139,7 +139,7 @@ def registerAsTable(self, name): self.registerTempTable(name) def insertInto(self, tableName, overwrite=False): - """Inserts the contents of this DataFrame into the specified table. + """Inserts the contents of this :class:`DataFrame` into the specified table. Optionally overwriting any existing data. """ @@ -165,7 +165,7 @@ def _java_save_mode(self, mode): return jmode def saveAsTable(self, tableName, source=None, mode="append", **options): - """Saves the contents of the DataFrame to a data source as a table. + """Saves the contents of the :class:`DataFrame` to a data source as a table. The data source is specified by the `source` and a set of `options`. If `source` is not specified, the default data source configured by @@ -174,12 +174,13 @@ def saveAsTable(self, tableName, source=None, mode="append", **options): Additionally, mode is used to specify the behavior of the saveAsTable operation when table already exists in the data source. There are four modes: - * append: Contents of this DataFrame are expected to be appended to existing table. - * overwrite: Data in the existing table is expected to be overwritten by the contents of \ - this DataFrame. + * append: Contents of this :class:`DataFrame` are expected to be appended \ + to existing table. + * overwrite: Data in the existing table is expected to be overwritten by \ + the contents of this DataFrame. * error: An exception is expected to be thrown. - * ignore: The save operation is expected to not save the contents of the DataFrame and \ - to not change the existing table. + * ignore: The save operation is expected to not save the contents of the \ + :class:`DataFrame` and to not change the existing table. 
""" if source is None: source = self.sql_ctx.getConf("spark.sql.sources.default", @@ -190,7 +191,7 @@ def saveAsTable(self, tableName, source=None, mode="append", **options): self._jdf.saveAsTable(tableName, source, jmode, joptions) def save(self, path=None, source=None, mode="append", **options): - """Saves the contents of the DataFrame to a data source. + """Saves the contents of the :class:`DataFrame` to a data source. The data source is specified by the `source` and a set of `options`. If `source` is not specified, the default data source configured by @@ -199,11 +200,11 @@ def save(self, path=None, source=None, mode="append", **options): Additionally, mode is used to specify the behavior of the save operation when data already exists in the data source. There are four modes: - * append: Contents of this DataFrame are expected to be appended to existing data. + * append: Contents of this :class:`DataFrame` are expected to be appended to existing data. * overwrite: Existing data is expected to be overwritten by the contents of this DataFrame. * error: An exception is expected to be thrown. - * ignore: The save operation is expected to not save the contents of the DataFrame and \ - to not change the existing data. + * ignore: The save operation is expected to not save the contents of \ + the :class:`DataFrame` and to not change the existing data. """ if path is not None: options["path"] = path @@ -217,7 +218,7 @@ def save(self, path=None, source=None, mode="append", **options): @property def schema(self): - """Returns the schema of this DataFrame (represented by + """Returns the schema of this :class:`DataFrame` (represented by a L{StructType}). >>> df.schema @@ -275,12 +276,12 @@ def show(self): """ Print the first 20 rows. + >>> df + DataFrame[age: int, name: string] >>> df.show() age name 2 Alice 5 Bob - >>> df - DataFrame[age: int, name: string] """ print self._jdf.showString().encode('utf8', 'ignore') @@ -481,8 +482,8 @@ def columns(self): def join(self, other, joinExprs=None, joinType=None): """ - Join with another DataFrame, using the given join expression. - The following performs a full outer join between `df1` and `df2`:: + Join with another :class:`DataFrame`, using the given join expression. + The following performs a full outer join between `df1` and `df2`. :param other: Right side of the join :param joinExprs: Join expression @@ -582,8 +583,6 @@ def __getattr__(self, name): def select(self, *cols): """ Selecting a set of expressions. - >>> df.select().collect() - [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] >>> df.select('*').collect() [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')] >>> df.select('name', 'age').collect() @@ -591,8 +590,6 @@ def select(self, *cols): >>> df.select(df.name, (df.age + 10).alias('age')).collect() [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)] """ - if not cols: - cols = ["*"] jcols = ListConverter().convert([_to_java_column(c) for c in cols], self._sc._gateway._gateway_client) jdf = self._jdf.select(self.sql_ctx._sc._jvm.PythonUtils.toSeq(jcols)) @@ -612,7 +609,7 @@ def selectExpr(self, *expr): def filter(self, condition): """ Filtering rows using the given condition, which could be - Column expression or string of SQL expression. + :class:`Column` expression or string of SQL expression. where() is an alias for filter(). 
@@ -666,7 +663,7 @@ def agg(self, *exprs): return self.groupBy().agg(*exprs) def unionAll(self, other): - """ Return a new DataFrame containing union of rows in this + """ Return a new :class:`DataFrame` containing union of rows in this frame and another frame. This is equivalent to `UNION ALL` in SQL. @@ -919,9 +916,10 @@ class Column(object): """ A column in a DataFrame. - `Column` instances can be created by:: + :class:`Column` instances can be created by:: # 1. Select a column out of a DataFrame + df.colName df["colName"] @@ -975,7 +973,7 @@ def __init__(self, jc): def substr(self, startPos, length): """ - Return a Column which is a substring of the column + Return a :class:`Column` which is a substring of the column :param startPos: start position (int or Column) :param length: length of the substring (int or Column) @@ -996,8 +994,10 @@ def substr(self, startPos, length): __getslice__ = substr # order - asc = _unary_op("asc") - desc = _unary_op("desc") + asc = _unary_op("asc", "Returns a sort expression based on the" + " ascending order of the given column name.") + desc = _unary_op("desc", "Returns a sort expression based on the" + " descending order of the given column name.") isNull = _unary_op("isNull", "True if the current expression is null.") isNotNull = _unary_op("isNotNull", "True if the current expression is not null.") diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 8aa44765205c1..5873f09ae3275 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -72,6 +72,7 @@ def _(col): globals()[_name] = _create_function(_name, _doc) del _name, _doc __all__ += _functions.keys() +__all__.sort() def countDistinct(col, *cols): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 39071e7e35ca1..83899ad4b1b12 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -36,9 +36,9 @@ else: import unittest -from pyspark.sql import SQLContext, HiveContext, Column -from pyspark.sql.types import IntegerType, Row, ArrayType, StructType, StructField, \ - UserDefinedType, DoubleType, LongType, StringType, _infer_type +from pyspark.sql import SQLContext, HiveContext, Column, Row +from pyspark.sql.types import * +from pyspark.sql.types import UserDefinedType, _infer_type from pyspark.tests import ReusedPySparkTestCase @@ -204,6 +204,68 @@ def test_infer_schema(self): result = self.sqlCtx.sql("SELECT l[0].a from test2 where d['key'].d = '2'") self.assertEqual(1, result.head()[0]) + def test_infer_nested_schema(self): + NestedRow = Row("f1", "f2") + nestedRdd1 = self.sc.parallelize([NestedRow([1, 2], {"row1": 1.0}), + NestedRow([2, 3], {"row2": 2.0})]) + df = self.sqlCtx.inferSchema(nestedRdd1) + self.assertEqual(Row(f1=[1, 2], f2={u'row1': 1.0}), df.collect()[0]) + + nestedRdd2 = self.sc.parallelize([NestedRow([[1, 2], [2, 3]], [1, 2]), + NestedRow([[2, 3], [3, 4]], [2, 3])]) + df = self.sqlCtx.inferSchema(nestedRdd2) + self.assertEqual(Row(f1=[[1, 2], [2, 3]], f2=[1, 2]), df.collect()[0]) + + from collections import namedtuple + CustomRow = namedtuple('CustomRow', 'field1 field2') + rdd = self.sc.parallelize([CustomRow(field1=1, field2="row1"), + CustomRow(field1=2, field2="row2"), + CustomRow(field1=3, field2="row3")]) + df = self.sqlCtx.inferSchema(rdd) + self.assertEquals(Row(field1=1, field2=u'row1'), df.first()) + + def test_apply_schema(self): + from datetime import date, datetime + rdd = self.sc.parallelize([(127, -128L, -32768, 32767, 2147483647L, 1.0, + date(2010, 1, 1), 
datetime(2010, 1, 1, 1, 1, 1), + {"a": 1}, (2,), [1, 2, 3], None)]) + schema = StructType([ + StructField("byte1", ByteType(), False), + StructField("byte2", ByteType(), False), + StructField("short1", ShortType(), False), + StructField("short2", ShortType(), False), + StructField("int1", IntegerType(), False), + StructField("float1", FloatType(), False), + StructField("date1", DateType(), False), + StructField("time1", TimestampType(), False), + StructField("map1", MapType(StringType(), IntegerType(), False), False), + StructField("struct1", StructType([StructField("b", ShortType(), False)]), False), + StructField("list1", ArrayType(ByteType(), False), False), + StructField("null1", DoubleType(), True)]) + df = self.sqlCtx.applySchema(rdd, schema) + results = df.map(lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int1, x.float1, x.date1, + x.time1, x.map1["a"], x.struct1.b, x.list1, x.null1)) + r = (127, -128, -32768, 32767, 2147483647, 1.0, date(2010, 1, 1), + datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None) + self.assertEqual(r, results.first()) + + df.registerTempTable("table2") + r = self.sqlCtx.sql("SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " + + "short1 + 1 AS short1, short2 - 1 AS short2, int1 - 1 AS int1, " + + "float1 + 1.5 as float1 FROM table2").first() + + self.assertEqual((126, -127, -32767, 32766, 2147483646, 2.5), tuple(r)) + + from pyspark.sql.types import _parse_schema_abstract, _infer_schema_type + rdd = self.sc.parallelize([(127, -32768, 1.0, datetime(2010, 1, 1, 1, 1, 1), + {"a": 1}, (2,), [1, 2, 3])]) + abstract = "byte1 short1 float1 time1 map1{} struct1(b) list1[]" + schema = _parse_schema_abstract(abstract) + typedSchema = _infer_schema_type(rdd.first(), schema) + df = self.sqlCtx.applySchema(rdd, typedSchema) + r = (127, -32768, 1.0, datetime(2010, 1, 1, 1, 1, 1), {"a": 1}, Row(b=2), [1, 2, 3]) + self.assertEqual(r, tuple(df.first())) + def test_struct_in_map(self): d = [Row(m={Row(i=1): Row(s="")})] df = self.sc.parallelize(d).toDF() diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index b6e41cf0b29ff..0f5dc2be6dab8 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -28,7 +28,7 @@ __all__ = [ "DataType", "NullType", "StringType", "BinaryType", "BooleanType", "DateType", "TimestampType", "DecimalType", "DoubleType", "FloatType", "ByteType", "IntegerType", - "LongType", "ShortType", "ArrayType", "MapType", "StructField", "StructType", ] + "LongType", "ShortType", "ArrayType", "MapType", "StructField", "StructType"] class DataType(object): From d51ed263ee791967380de6b9c892985ce87f6fcb Mon Sep 17 00:00:00 2001 From: prabs Date: Wed, 25 Feb 2015 14:37:35 +0000 Subject: [PATCH 299/357] [SPARK-5666][streaming][MQTT streaming] some trivial fixes modified to adhere to accepted coding standards as pointed by tdas in PR #3844 Author: prabs Author: Prabeesh K Closes #4178 from prabeesh/master and squashes the following commits: bd2cb49 [Prabeesh K] adress the comment ccc0765 [prabs] adress the comment 46f9619 [prabs] adress the comment c035bdc [prabs] adress the comment 22dd7f7 [prabs] address the comments 0cc67bd [prabs] adress the comment 838c38e [prabs] adress the comment cd57029 [prabs] address the comments 66919a3 [Prabeesh K] changed MqttDefaultFilePersistence to MemoryPersistence 5857989 [prabs] modified to adhere to accepted coding standards --- .../examples/streaming/MQTTWordCount.scala | 49 +++++++++++-------- .../streaming/mqtt/MQTTInputDStream.scala | 26 +++++----- 
.../spark/streaming/mqtt/MQTTUtils.scala | 3 +- .../streaming/mqtt/MQTTStreamSuite.scala | 12 ++--- 4 files changed, 50 insertions(+), 40 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala index 6ff0c47793a25..f40caad322f59 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/MQTTWordCount.scala @@ -17,8 +17,8 @@ package org.apache.spark.examples.streaming -import org.eclipse.paho.client.mqttv3.{MqttClient, MqttClientPersistence, MqttException, MqttMessage, MqttTopic} -import org.eclipse.paho.client.mqttv3.persist.MqttDefaultFilePersistence +import org.eclipse.paho.client.mqttv3._ +import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} @@ -31,8 +31,6 @@ import org.apache.spark.SparkConf */ object MQTTPublisher { - var client: MqttClient = _ - def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: MQTTPublisher ") @@ -42,25 +40,36 @@ object MQTTPublisher { StreamingExamples.setStreamingLogLevels() val Seq(brokerUrl, topic) = args.toSeq + + var client: MqttClient = null try { - var peristance:MqttClientPersistence =new MqttDefaultFilePersistence("/tmp") - client = new MqttClient(brokerUrl, MqttClient.generateClientId(), peristance) + val persistence = new MemoryPersistence() + client = new MqttClient(brokerUrl, MqttClient.generateClientId(), persistence) + + client.connect() + + val msgtopic = client.getTopic(topic) + val msgContent = "hello mqtt demo for spark streaming" + val message = new MqttMessage(msgContent.getBytes("utf-8")) + + while (true) { + try { + msgtopic.publish(message) + println(s"Published data. topic: {msgtopic.getName()}; Message: {message}") + } catch { + case e: MqttException if e.getReasonCode == MqttException.REASON_CODE_MAX_INFLIGHT => + Thread.sleep(10) + println("Queue is full, wait for to consume data from the message queue") + } + } } catch { case e: MqttException => println("Exception Caught: " + e) + } finally { + if (client != null) { + client.disconnect() + } } - - client.connect() - - val msgtopic: MqttTopic = client.getTopic(topic) - val msg: String = "hello mqtt demo for spark streaming" - - while (true) { - val message: MqttMessage = new MqttMessage(String.valueOf(msg).getBytes("utf-8")) - msgtopic.publish(message) - println("Published data. 
topic: " + msgtopic.getName() + " Message: " + message) - } - client.disconnect() } } @@ -96,9 +105,9 @@ object MQTTWordCount { val sparkConf = new SparkConf().setAppName("MQTTWordCount") val ssc = new StreamingContext(sparkConf, Seconds(2)) val lines = MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_ONLY_SER_2) - - val words = lines.flatMap(x => x.toString.split(" ")) + val words = lines.flatMap(x => x.split(" ")) val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) + wordCounts.print() ssc.start() ssc.awaitTermination() diff --git a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala index 1ef91dd49284f..3c0ef94cb0fab 100644 --- a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala +++ b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTInputDStream.scala @@ -17,23 +17,23 @@ package org.apache.spark.streaming.mqtt +import java.io.IOException +import java.util.concurrent.Executors +import java.util.Properties + +import scala.collection.JavaConversions._ import scala.collection.Map import scala.collection.mutable.HashMap -import scala.collection.JavaConversions._ import scala.reflect.ClassTag -import java.util.Properties -import java.util.concurrent.Executors -import java.io.IOException - +import org.eclipse.paho.client.mqttv3.IMqttDeliveryToken import org.eclipse.paho.client.mqttv3.MqttCallback import org.eclipse.paho.client.mqttv3.MqttClient import org.eclipse.paho.client.mqttv3.MqttClientPersistence -import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence -import org.eclipse.paho.client.mqttv3.IMqttDeliveryToken import org.eclipse.paho.client.mqttv3.MqttException import org.eclipse.paho.client.mqttv3.MqttMessage import org.eclipse.paho.client.mqttv3.MqttTopic +import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel @@ -82,18 +82,18 @@ class MQTTReceiver( val client = new MqttClient(brokerUrl, MqttClient.generateClientId(), persistence) // Callback automatically triggers as and when new message arrives on specified topic - val callback: MqttCallback = new MqttCallback() { + val callback = new MqttCallback() { // Handles Mqtt message - override def messageArrived(arg0: String, arg1: MqttMessage) { - store(new String(arg1.getPayload(),"utf-8")) + override def messageArrived(topic: String, message: MqttMessage) { + store(new String(message.getPayload(),"utf-8")) } - override def deliveryComplete(arg0: IMqttDeliveryToken) { + override def deliveryComplete(token: IMqttDeliveryToken) { } - override def connectionLost(arg0: Throwable) { - restart("Connection lost ", arg0) + override def connectionLost(cause: Throwable) { + restart("Connection lost ", cause) } } diff --git a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala index c5ffe51f9986c..1142d0f56ba34 100644 --- a/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala +++ b/external/mqtt/src/main/scala/org/apache/spark/streaming/mqtt/MQTTUtils.scala @@ -17,10 +17,11 @@ package org.apache.spark.streaming.mqtt +import scala.reflect.ClassTag + import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext, 
JavaDStream} -import scala.reflect.ClassTag import org.apache.spark.streaming.dstream.{ReceiverInputDStream, DStream} object MQTTUtils { diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala index 19c9271af77be..0f3298af6234a 100644 --- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala @@ -42,8 +42,8 @@ import org.apache.spark.util.Utils class MQTTStreamSuite extends FunSuite with Eventually with BeforeAndAfter { private val batchDuration = Milliseconds(500) - private val master: String = "local[2]" - private val framework: String = this.getClass.getSimpleName + private val master = "local[2]" + private val framework = this.getClass.getSimpleName private val freePort = findFreePort() private val brokerUri = "//localhost:" + freePort private val topic = "def" @@ -69,7 +69,7 @@ class MQTTStreamSuite extends FunSuite with Eventually with BeforeAndAfter { test("mqtt input stream") { val sendMessage = "MQTT demo for spark streaming" - val receiveStream: ReceiverInputDStream[String] = + val receiveStream = MQTTUtils.createStream(ssc, "tcp:" + brokerUri, topic, StorageLevel.MEMORY_ONLY) @volatile var receiveMessage: List[String] = List() receiveStream.foreachRDD { rdd => @@ -123,12 +123,12 @@ class MQTTStreamSuite extends FunSuite with Eventually with BeforeAndAfter { def publishData(data: String): Unit = { var client: MqttClient = null try { - val persistence: MqttClientPersistence = new MqttDefaultFilePersistence(persistenceDir.getAbsolutePath) + val persistence = new MqttDefaultFilePersistence(persistenceDir.getAbsolutePath) client = new MqttClient("tcp:" + brokerUri, MqttClient.generateClientId(), persistence) client.connect() if (client.isConnected) { - val msgTopic: MqttTopic = client.getTopic(topic) - val message: MqttMessage = new MqttMessage(data.getBytes("utf-8")) + val msgTopic = client.getTopic(topic) + val message = new MqttMessage(data.getBytes("utf-8")) message.setQos(1) message.setRetained(true) From 5b8480e0359d5af8bdf570f115acb0b1b8997735 Mon Sep 17 00:00:00 2001 From: Benedikt Linse Date: Wed, 25 Feb 2015 14:46:17 +0000 Subject: [PATCH 300/357] [GraphX] fixing 3 typos in the graphx programming guide Corrected 3 Typos in the GraphX programming guide. I hope this is the correct way to contribute. Author: Benedikt Linse Closes #4766 from 1123/master and squashes the following commits: 8a63812 [Benedikt Linse] fixing 3 typos in the graphx programming guide --- docs/graphx-programming-guide.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 826f6d8f371c7..28bdf81ca0ca5 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -538,7 +538,7 @@ val joinedGraph = graph.joinVertices(uniqueCosts, ## Neighborhood Aggregation -A key step in may graph analytics tasks is aggregating information about the neighborhood of each +A key step in many graph analytics tasks is aggregating information about the neighborhood of each vertex. For example, we might want to know the number of followers each user has or the average age of the the followers of each user. 
Many iterative graph algorithms (e.g., PageRank, Shortest Path, and @@ -634,7 +634,7 @@ avgAgeOfOlderFollowers.collect.foreach(println(_)) ### Map Reduce Triplets Transition Guide (Legacy) -In earlier versions of GraphX we neighborhood aggregation was accomplished using the +In earlier versions of GraphX neighborhood aggregation was accomplished using the [`mapReduceTriplets`][Graph.mapReduceTriplets] operator: {% highlight scala %} @@ -682,8 +682,8 @@ val result = graph.aggregateMessages[String](msgFun, reduceFun) ### Computing Degree Information A common aggregation task is computing the degree of each vertex: the number of edges adjacent to -each vertex. In the context of directed graphs it often necessary to know the in-degree, out- -degree, and the total degree of each vertex. The [`GraphOps`][GraphOps] class contains a +each vertex. In the context of directed graphs it is often necessary to know the in-degree, +out-degree, and the total degree of each vertex. The [`GraphOps`][GraphOps] class contains a collection of operators to compute the degrees of each vertex. For example in the following we compute the max in, out, and total degrees: From dd077abf2e2949fdfec31074b760b587f00efcf2 Mon Sep 17 00:00:00 2001 From: guliangliang Date: Wed, 25 Feb 2015 14:48:02 +0000 Subject: [PATCH 301/357] [SPARK-5771] Number of Cores in Completed Applications of Standalone Master Web Page always be 0 if sc.stop() is called In Standalone mode, the number of cores in Completed Applications of the Master Web Page will always be zero, if sc.stop() is called. But the number will always be right, if sc.stop() is not called. The reason maybe: after sc.stop() is called, the function removeExecutor of class ApplicationInfo will be called, thus reduce the variable coresGranted to zero. The variable coresGranted is used to display the number of Cores on the Web Page. 
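A toy sketch in plain Python (not Spark's actual Scala classes) of the bookkeeping described above: the granted count drains to zero as executors are removed during sc.stop(), while the requested count is fixed at submission time, so it is the sensible value to display for a completed application.

class AppInfoSketch(object):
    """Hypothetical model of the master's per-application core accounting."""
    def __init__(self, requested_cores):
        self.requested_cores = requested_cores   # fixed when the app is submitted
        self.cores_granted = 0                   # changes as executors come and go

    def add_executor(self, cores):
        self.cores_granted += cores

    def remove_executor(self, cores):
        self.cores_granted -= cores              # called for each executor on sc.stop()

    def cores_for_ui(self, active):
        # active apps show what they currently hold; completed apps show what they asked for
        return self.cores_granted if active else self.requested_cores

app = AppInfoSketch(requested_cores=8)
app.add_executor(4); app.add_executor(4)
app.remove_executor(4); app.remove_executor(4)   # application finishes
assert app.cores_for_ui(active=False) == 8       # no longer rendered as 0
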
Author: guliangliang Closes #4567 from marsishandsome/Spark5771 and squashes the following commits: 694796e [guliangliang] remove duplicate code a20e390 [guliangliang] change to Cores Using & Requested 0c19c95 [guliangliang] change Cores to Cores (max) cfbd97d [guliangliang] [SPARK-5771] Number of Cores in Completed Applications of Standalone Master Web Page always be 0 if sc.stop() is called --- .../spark/deploy/master/ApplicationInfo.scala | 4 +-- .../spark/deploy/master/ui/MasterPage.scala | 31 +++++++++++++++---- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala index ede0a9dbefb8d..a962dc4af2f6c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala @@ -90,9 +90,9 @@ private[spark] class ApplicationInfo( } } - private val myMaxCores = desc.maxCores.getOrElse(defaultCores) + val requestedCores = desc.maxCores.getOrElse(defaultCores) - def coresLeft: Int = myMaxCores - coresGranted + def coresLeft: Int = requestedCores - coresGranted private var _retryCount = 0 diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index fd514f07664a9..9dd96493ee48d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -50,12 +50,16 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { val workers = state.workers.sortBy(_.id) val workerTable = UIUtils.listingTable(workerHeaders, workerRow, workers) - val appHeaders = Seq("Application ID", "Name", "Cores", "Memory per Node", "Submitted Time", - "User", "State", "Duration") + val activeAppHeaders = Seq("Application ID", "Name", "Cores in Use", + "Cores Requested", "Memory per Node", "Submitted Time", "User", "State", "Duration") val activeApps = state.activeApps.sortBy(_.startTime).reverse - val activeAppsTable = UIUtils.listingTable(appHeaders, appRow, activeApps) + val activeAppsTable = UIUtils.listingTable(activeAppHeaders, activeAppRow, activeApps) + + val completedAppHeaders = Seq("Application ID", "Name", "Cores Requested", "Memory per Node", + "Submitted Time", "User", "State", "Duration") val completedApps = state.completedApps.sortBy(_.endTime).reverse - val completedAppsTable = UIUtils.listingTable(appHeaders, appRow, completedApps) + val completedAppsTable = UIUtils.listingTable(completedAppHeaders, completeAppRow, + completedApps) val driverHeaders = Seq("Submission ID", "Submitted Time", "Worker", "State", "Cores", "Memory", "Main Class") @@ -162,7 +166,7 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { } - private def appRow(app: ApplicationInfo): Seq[Node] = { + private def appRow(app: ApplicationInfo, active: Boolean): Seq[Node] = { {app.id} @@ -170,8 +174,15 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { {app.desc.name} + { + if (active) { + + {app.coresGranted} + + } + } - {app.coresGranted} + {app.requestedCores} {Utils.megabytesToString(app.desc.memoryPerSlave)} @@ -183,6 +194,14 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { } + private def activeAppRow(app: ApplicationInfo): Seq[Node] = { + appRow(app, active = true) + } + + private def 
completeAppRow(app: ApplicationInfo): Seq[Node] = { + appRow(app, active = false) + } + private def driverRow(driver: DriverInfo): Seq[Node] = { {driver.id} From f84c799ea0b82abca6a4fad39532c2515743b632 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Wed, 25 Feb 2015 10:13:40 -0800 Subject: [PATCH 302/357] [SPARK-5996][SQL] Fix specialized outbound conversions Author: Michael Armbrust Closes #4757 from marmbrus/udtConversions and squashes the following commits: 3714aad [Michael Armbrust] [SPARK-5996][SQL] Fix specialized outbound conversions --- .../apache/spark/sql/execution/LocalTableScan.scala | 7 +++++-- .../apache/spark/sql/execution/basicOperators.scala | 8 +++++--- .../org/apache/spark/sql/UserDefinedTypeSuite.scala | 10 ++++++++++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala index d6d8258f46a9a..d3a18b37d52b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.expressions.Attribute @@ -30,7 +31,9 @@ case class LocalTableScan(output: Seq[Attribute], rows: Seq[Row]) extends LeafNo override def execute() = rdd - override def executeCollect() = rows.toArray + override def executeCollect() = + rows.map(ScalaReflection.convertRowToScala(_, schema)).toArray - override def executeTake(limit: Int) = rows.take(limit).toArray + override def executeTake(limit: Int) = + rows.map(ScalaReflection.convertRowToScala(_, schema)).take(limit).toArray } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala index 4dc506c21ab9e..710268584cff1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala @@ -134,13 +134,15 @@ case class TakeOrdered(limit: Int, sortOrder: Seq[SortOrder], child: SparkPlan) val ord = new RowOrdering(sortOrder, child.output) + private def collectData() = child.execute().map(_.copy()).takeOrdered(limit)(ord) + // TODO: Is this copying for no reason? - override def executeCollect() = child.execute().map(_.copy()).takeOrdered(limit)(ord) - .map(ScalaReflection.convertRowToScala(_, this.schema)) + override def executeCollect() = + collectData().map(ScalaReflection.convertRowToScala(_, this.schema)) // TODO: Terminal split should be implemented differently from non-terminal split. // TODO: Pick num splits based on |limit|. 
- override def execute() = sparkContext.makeRDD(executeCollect(), 1) + override def execute() = sparkContext.makeRDD(collectData(), 1) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 9c098df24c65f..47fdb5543235c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -22,6 +22,7 @@ import java.io.File import scala.beans.{BeanInfo, BeanProperty} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext.{sparkContext, sql} import org.apache.spark.sql.test.TestSQLContext.implicits._ @@ -105,4 +106,13 @@ class UserDefinedTypeSuite extends QueryTest { tempDir.delete() pointsRDD.repartition(1).saveAsParquetFile(tempDir.getCanonicalPath) } + + // Tests to make sure that all operators correctly convert types on the way out. + test("Local UDTs") { + val df = Seq((1, new MyDenseVector(Array(0.1, 1.0)))).toDF("int", "vec") + df.collect()(0).getAs[MyDenseVector](1) + df.take(1)(0).getAs[MyDenseVector](1) + df.limit(1).groupBy('int).agg(first('vec)).collect()(0).getAs[MyDenseVector](0) + df.orderBy('int).limit(1).groupBy('int).agg(first('vec)).collect()(0).getAs[MyDenseVector](0) + } } From 7d8e6a2e44e13a6d6cdcd98a0d0c33b243ef0dc2 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 25 Feb 2015 12:20:44 -0800 Subject: [PATCH 303/357] SPARK-5930 [DOCS] Documented default of spark.shuffle.io.retryWait is confusing Clarify default max wait in spark.shuffle.io.retryWait docs CC andrewor14 Author: Sean Owen Closes #4769 from srowen/SPARK-5930 and squashes the following commits: ae2792b [Sean Owen] Clarify default max wait in spark.shuffle.io.retryWait docs --- docs/configuration.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index c8db338cb6f89..81298514a7cb2 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -955,7 +955,9 @@ Apart from these, the following properties are also available, and may be useful 5 (Netty only) Seconds to wait between retries of fetches. The maximum delay caused by retrying - is simply maxRetries * retryWait, by default 15 seconds. + is simply maxRetries * retryWait. The default maximum delay is therefore + 15 seconds, because the default value of maxRetries is 3, and the default + retryWait here is 5 seconds. From a777c65da9bc636e5cf5426e15a2e76d6b21b744 Mon Sep 17 00:00:00 2001 From: Milan Straka Date: Wed, 25 Feb 2015 21:33:34 +0000 Subject: [PATCH 304/357] [SPARK-5970][core] Register directory created in getOrCreateLocalRootDirs for automatic deletion. As documented in createDirectory, the result of createDirectory is not registered for automatic removal. Currently there are 4 directories left in `/tmp` after just running `pyspark`. Author: Milan Straka Closes #4759 from foxik/remove-tmp-dirs and squashes the following commits: 280450d [Milan Straka] Use createTempDir in getOrCreateLocalRootDirs... 
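A rough Python illustration of the distinction the fix relies on; tempfile/atexit here merely stand in for Spark's shutdown-hook registration, and the actual one-line Scala change is in the diff below. A directory that is merely created sticks around after the process exits, while one registered for cleanup is removed automatically.

import atexit
import shutil
import tempfile

def create_directory(root):
    # analogue of a bare createDirectory: the caller is responsible for cleanup
    return tempfile.mkdtemp(prefix='spark-', dir=root)

def create_temp_dir(root):
    # analogue of createTempDir: same creation, but registered for deletion at exit
    path = tempfile.mkdtemp(prefix='spark-', dir=root)
    atexit.register(shutil.rmtree, path, True)  # ignore_errors=True
    return path

leaked = create_directory('/tmp')   # would outlive the process, like the leftover dirs
cleaned = create_temp_dir('/tmp')   # removed automatically when the interpreter exits
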
--- core/src/main/scala/org/apache/spark/util/Utils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index df21ed37e76b1..4803ff9403b1d 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -696,7 +696,7 @@ private[spark] object Utils extends Logging { try { val rootDir = new File(root) if (rootDir.exists || rootDir.mkdirs()) { - val dir = createDirectory(root) + val dir = createTempDir(root) chmod700(dir) Some(dir.getAbsolutePath) } else { From 9f603fce78fcc997926e9a72dec44d48cbc396fc Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 25 Feb 2015 14:11:12 -0800 Subject: [PATCH 305/357] [SPARK-1955][GraphX]: VertexRDD can incorrectly assume index sharing Fixes the issue whereby when VertexRDD's are `diff`ed, `innerJoin`ed, or `leftJoin`ed and have different partition sizes they fail under the `zipPartitions` method. This fix tests whether the partitions are equal or not and, if not, will repartition the other to match the partition size of the calling VertexRDD. Author: Brennon York Closes #4705 from brennonyork/SPARK-1955 and squashes the following commits: 0882590 [Brennon York] updated to properly handle differently-partitioned vertexRDDs --- .../org/apache/spark/graphx/impl/VertexRDDImpl.scala | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala index 6dad167fa7411..904be213147dc 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala @@ -104,8 +104,14 @@ class VertexRDDImpl[VD] private[graphx] ( this.mapVertexPartitions(_.map(f)) override def diff(other: VertexRDD[VD]): VertexRDD[VD] = { + val otherPartition = other match { + case other: VertexRDD[_] if this.partitioner == other.partitioner => + other.partitionsRDD + case _ => + VertexRDD(other.partitionBy(this.partitioner.get)).partitionsRDD + } val newPartitionsRDD = partitionsRDD.zipPartitions( - other.partitionsRDD, preservesPartitioning = true + otherPartition, preservesPartitioning = true ) { (thisIter, otherIter) => val thisPart = thisIter.next() val otherPart = otherIter.next() @@ -133,7 +139,7 @@ class VertexRDDImpl[VD] private[graphx] ( // Test if the other vertex is a VertexRDD to choose the optimal join strategy. // If the other set is a VertexRDD then we use the much more efficient leftZipJoin other match { - case other: VertexRDD[_] => + case other: VertexRDD[_] if this.partitioner == other.partitioner => leftZipJoin(other)(f) case _ => this.withPartitionsRDD[VD3]( @@ -162,7 +168,7 @@ class VertexRDDImpl[VD] private[graphx] ( // Test if the other vertex is a VertexRDD to choose the optimal join strategy. 
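As a rough illustration of why the metric was misleading (plain Python mmap rather than Spark's ManagedBuffer path, and a purely hypothetical file path): mapping the shuffle file returns almost immediately, so the timed region captures very little, and the real disk I/O is paid later when the task first touches the mapped bytes.

import mmap
import time

def timed_local_fetch(path):
    with open(path, 'rb') as f:
        start = time.time()
        buf = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        fetch_millis = (time.time() - start) * 1000  # what the removed metric captured: ~0
        # the real disk reads happen here, as pages are faulted in while the data is used
        start = time.time()
        checksum = sum(bytearray(buf[:]))
        read_millis = (time.time() - start) * 1000   # can dwarf fetch_millis
    return fetch_millis, read_millis, checksum

# timed_local_fetch('/tmp/shuffle_0_0_0.data')  # path is illustrative only
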
// If the other set is a VertexRDD then we use the much more efficient innerZipJoin other match { - case other: VertexRDD[_] => + case other: VertexRDD[_] if this.partitioner == other.partitioner => innerZipJoin(other)(f) case _ => this.withPartitionsRDD( From 838a48036c050cef03b8c3620e16b5495cd7beab Mon Sep 17 00:00:00 2001 From: Kay Ousterhout Date: Wed, 25 Feb 2015 14:55:24 -0800 Subject: [PATCH 306/357] [SPARK-5982] Remove incorrect Local Read Time Metric This metric is incomplete, because the files are memory mapped, so much of the read from disk occurs later as tasks actually read the file's data. This should be merged into 1.3, so that we never expose this incorrect metric to users. CC pwendell ksakellis sryza Author: Kay Ousterhout Closes #4749 from kayousterhout/SPARK-5982 and squashes the following commits: 9737b5e [Kay Ousterhout] More fixes a1eb300 [Kay Ousterhout] Removed one more use of local read time cf13497 [Kay Ousterhout] [SPARK-5982] Remove incorrectwq Local Read Time Metric --- .../scala/org/apache/spark/executor/TaskMetrics.scala | 8 -------- .../main/scala/org/apache/spark/scheduler/JobLogger.scala | 1 - .../spark/storage/ShuffleBlockFetcherIterator.scala | 2 -- .../main/scala/org/apache/spark/util/JsonProtocol.scala | 2 -- .../scala/org/apache/spark/util/JsonProtocolSuite.scala | 3 --- 5 files changed, 16 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index df36566bec4b1..07b152651dedf 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -203,7 +203,6 @@ class TaskMetrics extends Serializable { merged.incRemoteBlocksFetched(depMetrics.remoteBlocksFetched) merged.incRemoteBytesRead(depMetrics.remoteBytesRead) merged.incLocalBytesRead(depMetrics.localBytesRead) - merged.incLocalReadTime(depMetrics.localReadTime) merged.incRecordsRead(depMetrics.recordsRead) } _shuffleReadMetrics = Some(merged) @@ -345,13 +344,6 @@ class ShuffleReadMetrics extends Serializable { private[spark] def incRemoteBytesRead(value: Long) = _remoteBytesRead += value private[spark] def decRemoteBytesRead(value: Long) = _remoteBytesRead -= value - /** - * Time the task spent (in milliseconds) reading local shuffle blocks (from the local disk). - */ - private var _localReadTime: Long = _ - def localReadTime = _localReadTime - private[spark] def incLocalReadTime(value: Long) = _localReadTime += value - /** * Shuffle data that was read from the local disk (as opposed to from a remote executor). 
*/ diff --git a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala index f9fc8aa30454e..8aa528ac573d0 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala @@ -170,7 +170,6 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener " BLOCK_FETCHED_REMOTE=" + metrics.remoteBlocksFetched + " REMOTE_FETCH_WAIT_TIME=" + metrics.fetchWaitTime + " REMOTE_BYTES_READ=" + metrics.remoteBytesRead + - " LOCAL_READ_TIME=" + metrics.localReadTime + " LOCAL_BYTES_READ=" + metrics.localBytesRead case None => "" } diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index 2ebb79989da43..8f28ef49a8a6f 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -228,7 +228,6 @@ final class ShuffleBlockFetcherIterator( * track in-memory are the ManagedBuffer references themselves. */ private[this] def fetchLocalBlocks() { - val startTime = System.currentTimeMillis val iter = localBlocks.iterator while (iter.hasNext) { val blockId = iter.next() @@ -246,7 +245,6 @@ final class ShuffleBlockFetcherIterator( return } } - shuffleMetrics.incLocalReadTime(System.currentTimeMillis - startTime) } private[this] def initialize(): Unit = { diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 58d37e2d667f7..8e20864db5673 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -294,7 +294,6 @@ private[spark] object JsonProtocol { ("Local Blocks Fetched" -> shuffleReadMetrics.localBlocksFetched) ~ ("Fetch Wait Time" -> shuffleReadMetrics.fetchWaitTime) ~ ("Remote Bytes Read" -> shuffleReadMetrics.remoteBytesRead) ~ - ("Local Read Time" -> shuffleReadMetrics.localReadTime) ~ ("Local Bytes Read" -> shuffleReadMetrics.localBytesRead) ~ ("Total Records Read" -> shuffleReadMetrics.recordsRead) } @@ -676,7 +675,6 @@ private[spark] object JsonProtocol { metrics.incLocalBlocksFetched((json \ "Local Blocks Fetched").extract[Int]) metrics.incFetchWaitTime((json \ "Fetch Wait Time").extract[Long]) metrics.incRemoteBytesRead((json \ "Remote Bytes Read").extract[Long]) - metrics.incLocalReadTime((json \ "Local Read Time").extractOpt[Long].getOrElse(0)) metrics.incLocalBytesRead((json \ "Local Bytes Read").extractOpt[Long].getOrElse(0)) metrics.incRecordsRead((json \ "Total Records Read").extractOpt[Long].getOrElse(0)) metrics diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index c181baf6844b0..a2be724254d7c 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -270,7 +270,6 @@ class JsonProtocolSuite extends FunSuite { .removeField { case (field, _) => field == "Local Read Time" } val newMetrics = JsonProtocol.taskMetricsFromJson(oldJson) assert(newMetrics.shuffleReadMetrics.get.localBytesRead == 0) - assert(newMetrics.shuffleReadMetrics.get.localReadTime == 0) } test("SparkListenerApplicationStart backwards compatibility") { @@ -708,7 +707,6 
@@ class JsonProtocolSuite extends FunSuite { sr.incFetchWaitTime(a + d) sr.incRemoteBlocksFetched(f) sr.incRecordsRead(if (hasRecords) (b + d) / 100 else -1) - sr.incLocalReadTime(a + e) sr.incLocalBytesRead(a + f) t.setShuffleReadMetrics(Some(sr)) } @@ -956,7 +954,6 @@ class JsonProtocolSuite extends FunSuite { | "Local Blocks Fetched": 700, | "Fetch Wait Time": 900, | "Remote Bytes Read": 1000, - | "Local Read Time": 1000, | "Local Bytes Read": 1100, | "Total Records Read" : 10 | }, From f3f4c87b3d944c10d1200dfe49091ebb2a149be6 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Wed, 25 Feb 2015 15:13:34 -0800 Subject: [PATCH 307/357] [SPARK-5944] [PySpark] fix version in Python API docs use RELEASE_VERSION when building the Python API docs Author: Davies Liu Closes #4731 from davies/api_version and squashes the following commits: c9744c9 [Davies Liu] Update create-release.sh 08cbc3f [Davies Liu] fix python docs --- dev/create-release/create-release.sh | 2 +- python/docs/conf.py | 6 +++--- python/docs/pyspark.sql.rst | 2 +- python/pyspark/rdd.py | 4 ++++ 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh index 607ce1c803507..da15ce3e0e2f7 100755 --- a/dev/create-release/create-release.sh +++ b/dev/create-release/create-release.sh @@ -237,7 +237,7 @@ if [[ ! "$@" =~ --skip-package ]]; then sbt/sbt clean cd docs # Compile docs with Java 7 to use nicer format - JAVA_HOME=$JAVA_7_HOME PRODUCTION=1 jekyll build + JAVA_HOME="$JAVA_7_HOME" PRODUCTION=1 RELEASE_VERSION="$RELEASE_VERSION" jekyll build echo "Copying release documentation" rc_docs_folder=${rc_folder}-docs ssh $ASF_USERNAME@people.apache.org \ diff --git a/python/docs/conf.py b/python/docs/conf.py index cbbf7ffb08992..163987dd8e5fa 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -48,16 +48,16 @@ # General information about the project. project = u'PySpark' -copyright = u'2014, Author' +copyright = u'' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '1.3-SNAPSHOT' +version = 'master' # The full version, including alpha/beta/rc tags. -release = '1.3-SNAPSHOT' +release = os.environ.get('RELEASE_VERSION', version) # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/python/docs/pyspark.sql.rst b/python/docs/pyspark.sql.rst index 2e3f69b9a562a..6259379ed05b7 100644 --- a/python/docs/pyspark.sql.rst +++ b/python/docs/pyspark.sql.rst @@ -17,7 +17,7 @@ pyspark.sql.types module pyspark.sql.functions module ------------------------- +---------------------------- .. automodule:: pyspark.sql.functions :members: :undoc-members: diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index d3148de6f41a3..cb12fed98c53d 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2111,6 +2111,7 @@ def _to_java_object_rdd(self): def countApprox(self, timeout, confidence=0.95): """ .. note:: Experimental + Approximate version of count() that returns a potentially incomplete result within a timeout, even if not all tasks have finished. @@ -2124,6 +2125,7 @@ def countApprox(self, timeout, confidence=0.95): def sumApprox(self, timeout, confidence=0.95): """ .. note:: Experimental + Approximate operation to return the sum within a timeout or meet the confidence. 
@@ -2140,6 +2142,7 @@ def sumApprox(self, timeout, confidence=0.95): def meanApprox(self, timeout, confidence=0.95): """ .. note:: Experimental + Approximate operation to return the mean within a timeout or meet the confidence. @@ -2156,6 +2159,7 @@ def meanApprox(self, timeout, confidence=0.95): def countApproxDistinct(self, relativeSD=0.05): """ .. note:: Experimental + Return approximate number of distinct elements in the RDD. The algorithm used is based on streamlib's implementation of From e0fdd467e277867d6bec5c6605cc1cabce70ac89 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 25 Feb 2015 15:15:22 -0800 Subject: [PATCH 308/357] [SPARK-6010] [SQL] Merging compatible Parquet schemas before computing splits `ReadContext.init` calls `InitContext.getMergedKeyValueMetadata`, which doesn't know how to merge conflicting user defined key-value metadata and throws exception. In our case, when dealing with different but compatible schemas, we have different Spark SQL schema JSON strings in different Parquet part-files, thus causes this problem. Reading similar Parquet files generated by Hive doesn't suffer from this issue. In this PR, we manually merge the schemas before passing it to `ReadContext` to avoid the exception. [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4768) Author: Cheng Lian Closes #4768 from liancheng/spark-6010 and squashes the following commits: 9002f0a [Cheng Lian] Fixes SPARK-6010 --- .../sql/parquet/ParquetTableOperations.scala | 20 +++++++++++++++++- .../spark/sql/parquet/ParquetTest.scala | 5 +++++ .../ParquetPartitionDiscoverySuite.scala | 21 +++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 65966458eb670..4dc13b036cd4e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -48,6 +48,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row, _} import org.apache.spark.sql.execution.{LeafNode, SparkPlan, UnaryNode} +import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.{Logging, SerializableWritable, TaskContext} /** @@ -459,13 +460,30 @@ private[parquet] class FilteringParquetRowInputFormat val getGlobalMetaData = classOf[ParquetFileWriter].getDeclaredMethod("getGlobalMetaData", classOf[JList[Footer]]) getGlobalMetaData.setAccessible(true) - val globalMetaData = getGlobalMetaData.invoke(null, footers).asInstanceOf[GlobalMetaData] + var globalMetaData = getGlobalMetaData.invoke(null, footers).asInstanceOf[GlobalMetaData] if (globalMetaData == null) { val splits = mutable.ArrayBuffer.empty[ParquetInputSplit] return splits } + Option(globalMetaData.getKeyValueMetaData.get(RowReadSupport.SPARK_METADATA_KEY)).foreach { + schemas => + val mergedSchema = schemas + .map(DataType.fromJson(_).asInstanceOf[StructType]) + .reduce(_ merge _) + .json + + val mergedMetadata = globalMetaData + .getKeyValueMetaData + .updated(RowReadSupport.SPARK_METADATA_KEY, setAsJavaSet(Set(mergedSchema))) + + globalMetaData = new GlobalMetaData( + globalMetaData.getSchema, + mergedMetadata, + globalMetaData.getCreatedBy) + } + val readContext = getReadSupport(configuration).init( new InitContext(configuration, 
globalMetaData.getKeyValueMetaData, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala index 0fa2fe90f9674..d6ea6679c5966 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -131,6 +131,11 @@ private[sql] trait ParquetTest { data.toDF().save(path.getCanonicalPath, "org.apache.spark.sql.parquet", SaveMode.Overwrite) } + protected def makeParquetFile[T <: Product: ClassTag: TypeTag]( + df: DataFrame, path: File): Unit = { + df.save(path.getCanonicalPath, "org.apache.spark.sql.parquet", SaveMode.Overwrite) + } + protected def makePartitionDir( basePath: File, defaultPartitionName: String, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala index 3bf0116c8f7e9..adb3c9391f6c2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala @@ -36,6 +36,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest { override val sqlContext: SQLContext = TestSQLContext import sqlContext._ + import sqlContext.implicits._ val defaultPartitionName = "__NULL__" @@ -319,4 +320,24 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest { } } } + + test("read partitioned table - merging compatible schemas") { + withTempDir { base => + makeParquetFile( + (1 to 10).map(i => Tuple1(i)).toDF("intField"), + makePartitionDir(base, defaultPartitionName, "pi" -> 1)) + + makeParquetFile( + (1 to 10).map(i => (i, i.toString)).toDF("intField", "stringField"), + makePartitionDir(base, defaultPartitionName, "pi" -> 2)) + + load(base.getCanonicalPath, "org.apache.spark.sql.parquet").registerTempTable("t") + + withTempTable("t") { + checkAnswer( + sql("SELECT * FROM t"), + (1 to 10).map(i => Row(i, null, 1)) ++ (1 to 10).map(i => Row(i, i.toString, 2))) + } + } + } } From 12dbf98c5d270e3846e946592666160b1541d9dc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 25 Feb 2015 15:22:33 -0800 Subject: [PATCH 309/357] [SPARK-5999][SQL] Remove duplicate Literal matching block Author: Liang-Chi Hsieh Closes #4760 from viirya/dup_literal and squashes the following commits: 06e7516 [Liang-Chi Hsieh] Remove duplicate Literal matching block. 
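The same de-duplication pattern in miniature, as a hedged sketch with toy names rather than Spark's real Literal and functions objects: keep a single value-to-literal factory that owns the unsupported-type error, and let the column-level helper delegate to it so the match is written only once.

// Toy model of the de-duplication only; ToyLiteral and toyFunctions are
// illustrative names, not Spark's Literal or functions objects.
sealed trait ToyLiteral
case class IntLit(value: Int) extends ToyLiteral
case class StrLit(value: String) extends ToyLiteral
case object NullLit extends ToyLiteral

object ToyLiteral {
  // Single place that maps Scala values to literals, with the catch-all error.
  def apply(v: Any): ToyLiteral = v match {
    case i: Int    => IntLit(i)
    case s: String => StrLit(s)
    case null      => NullLit
    case other =>
      throw new RuntimeException("Unsupported literal type " + other.getClass + " " + other)
  }
}

object toyFunctions {
  // Analogue of the patched lit(): delegate instead of repeating the match.
  def lit(v: Any): ToyLiteral = ToyLiteral(v)
}

object ToyLiteralDemo {
  def main(args: Array[String]): Unit = {
    println(toyFunctions.lit(42))      // IntLit(42)
    println(toyFunctions.lit("spark")) // StrLit(spark)
  }
}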
--- .../sql/catalyst/expressions/literals.scala | 2 ++ .../org/apache/spark/sql/functions.scala | 20 +------------------ 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 97bb96f48e2c7..9ff66563c8164 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -38,6 +38,8 @@ object Literal { case d: Date => Literal(DateUtils.fromJavaDate(d), DateType) case a: Array[Byte] => Literal(a, BinaryType) case null => Literal(null, NullType) + case _ => + throw new RuntimeException("Unsupported literal type " + v.getClass + " " + v) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 4fdbfc6d22c9b..111e751588a8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -74,25 +74,7 @@ object functions { case _ => // continue } - val literalExpr = literal match { - case v: Boolean => Literal(v, BooleanType) - case v: Byte => Literal(v, ByteType) - case v: Short => Literal(v, ShortType) - case v: Int => Literal(v, IntegerType) - case v: Long => Literal(v, LongType) - case v: Float => Literal(v, FloatType) - case v: Double => Literal(v, DoubleType) - case v: String => Literal(v, StringType) - case v: BigDecimal => Literal(Decimal(v), DecimalType.Unlimited) - case v: java.math.BigDecimal => Literal(Decimal(v), DecimalType.Unlimited) - case v: Decimal => Literal(v, DecimalType.Unlimited) - case v: java.sql.Timestamp => Literal(v, TimestampType) - case v: java.sql.Date => Literal(v, DateType) - case v: Array[Byte] => Literal(v, BinaryType) - case null => Literal(null, NullType) - case _ => - throw new RuntimeException("Unsupported literal type " + literal.getClass + " " + literal) - } + val literalExpr = Literal(literal) Column(literalExpr) } From 41e2e5acb749c25641f1f8dea5a2e1d8af319486 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 25 Feb 2015 15:37:13 -0800 Subject: [PATCH 310/357] [SPARK-5926] [SQL] make DataFrame.explain leverage queryExecution.logical DataFrame.explain return wrong result when the query is DDL command. For example, the following two queries should print out the same execution plan, but it not. sql("create table tb as select * from src where key > 490").explain(true) sql("explain extended create table tb as select * from src where key > 490") This is because DataFrame.explain leverage logicalPlan which had been forced executed, we should use the unexecuted plan queryExecution.logical. 
Author: Yanbo Liang Closes #4707 from yanboliang/spark-5926 and squashes the following commits: fa6db63 [Yanbo Liang] logicalPlan is not lazy 0e40a1b [Yanbo Liang] make DataFrame.explain leverage queryExecution.logical --- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 04bf5d9b0f931..f045da305ca5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -264,7 +264,7 @@ class DataFrame protected[sql]( */ def explain(extended: Boolean): Unit = { ExplainCommand( - logicalPlan, + queryExecution.logical, extended = extended).queryExecution.executedPlan.executeCollect().map { r => println(r.getString(0)) } From 46a044a36a2aff1306f7f677e952ce253ddbefac Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 25 Feb 2015 16:12:56 -0800 Subject: [PATCH 311/357] [SPARK-1182][Docs] Sort the configuration parameters in configuration.md Sorts all configuration options present on the `configuration.md` page to ease readability. Author: Brennon York Closes #3863 from brennonyork/SPARK-1182 and squashes the following commits: 5696f21 [Brennon York] fixed merge conflict with port comments 81a7b10 [Brennon York] capitalized A in Allocation e240486 [Brennon York] moved all spark.mesos properties into the running-on-mesos doc 7de5f75 [Brennon York] moved serialization from application to compression and serialization section a16fec0 [Brennon York] moved shuffle settings from network to shuffle f8fa286 [Brennon York] sorted encryption category 1023f15 [Brennon York] moved initialExecutors e9d62aa [Brennon York] fixed akka.heartbeat.interval 25e6f6f [Brennon York] moved spark.executer.user* 4625ade [Brennon York] added spark.executor.extra* items 4ee5648 [Brennon York] fixed merge conflicts 1b49234 [Brennon York] sorting mishap 2b5758b [Brennon York] sorting mishap 6fbdf42 [Brennon York] sorting mishap 55dc6f8 [Brennon York] sorted security ec34294 [Brennon York] sorted dynamic allocation 2a7c4a3 [Brennon York] sorted scheduling aa9acdc [Brennon York] sorted networking a4380b8 [Brennon York] sorted execution behavior 27f3919 [Brennon York] sorted compression and serialization 80a5bbb [Brennon York] sorted spark ui 3f32e5b [Brennon York] sorted shuffle behavior 6c51b38 [Brennon York] sorted runtime environment efe9d6f [Brennon York] sorted application properties --- docs/configuration.md | 1008 +++++++++++++++++++------------------- docs/running-on-mesos.md | 24 +- 2 files changed, 508 insertions(+), 524 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 81298514a7cb2..8dd2bad61344f 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -94,14 +94,6 @@ of the most common options to set are: The name of your application. This will appear in the UI and in log data. - - spark.master - (none) - - The cluster manager to connect to. See the list of - allowed master URL's. - - spark.driver.cores 1 @@ -109,27 +101,6 @@ of the most common options to set are: Number of cores to use for the driver process, only in cluster mode. - - spark.driver.memory - 512m - - Amount of memory to use for the driver process, i.e. where SparkContext is initialized. - (e.g. 512m, 2g). - -
    Note: In client mode, this config must not be set through the SparkConf - directly in your application, because the driver JVM has already started at that point. - Instead, please set this through the --driver-memory command line option - or in your default properties file. - - - spark.executor.memory - 512m - - Amount of memory to use per executor process, in the same format as JVM memory strings - (e.g. 512m, 2g). - - - spark.driver.maxResultSize 1g @@ -142,38 +113,35 @@ of the most common options to set are: - spark.serializer - org.apache.spark.serializer.
    JavaSerializer + spark.driver.memory + 512m - Class to use for serializing objects that will be sent over the network or need to be cached - in serialized form. The default of Java serialization works with any Serializable Java object - but is quite slow, so we recommend using - org.apache.spark.serializer.KryoSerializer and configuring Kryo serialization - when speed is necessary. Can be any subclass of - - org.apache.spark.Serializer. + Amount of memory to use for the driver process, i.e. where SparkContext is initialized. + (e.g. 512m, 2g). + +
    Note: In client mode, this config must not be set through the SparkConf + directly in your application, because the driver JVM has already started at that point. + Instead, please set this through the --driver-memory command line option + or in your default properties file. - spark.kryo.classesToRegister - (none) + spark.executor.memory + 512m - If you use Kryo serialization, give a comma-separated list of custom class names to register - with Kryo. - See the tuning guide for more details. + Amount of memory to use per executor process, in the same format as JVM memory strings + (e.g. 512m, 2g). - spark.kryo.registrator + spark.extraListeners (none) - If you use Kryo serialization, set this class to register your custom classes with Kryo. This - property is useful if you need to register your classes in a custom way, e.g. to specify a custom - field serializer. Otherwise spark.kryo.classesToRegister is simpler. It should be - set to a class that extends - - KryoRegistrator. - See the tuning guide for more details. + A comma-separated list of classes that implement SparkListener; when initializing + SparkContext, instances of these classes will be created and registered with Spark's listener + bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor + will be called; otherwise, a zero-argument constructor will be called. If no valid constructor + can be found, the SparkContext creation will fail with an exception. @@ -196,14 +164,11 @@ of the most common options to set are: - spark.extraListeners + spark.master (none) - A comma-separated list of classes that implement SparkListener; when initializing - SparkContext, instances of these classes will be created and registered with Spark's listener - bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor - will be called; otherwise, a zero-argument constructor will be called. If no valid constructor - can be found, the SparkContext creation will fail with an exception. + The cluster manager to connect to. See the list of + allowed master URL's. @@ -214,26 +179,26 @@ Apart from these, the following properties are also available, and may be useful - + - + @@ -261,23 +226,22 @@ Apart from these, the following properties are also available, and may be useful - + - + @@ -287,6 +251,24 @@ Apart from these, the following properties are also available, and may be useful Set a special library path to use when launching executor JVM's. + + + + + + + + + + @@ -308,24 +290,6 @@ Apart from these, the following properties are also available, and may be useful for automatic cleaning of old logs. - - - - - - - - - - @@ -335,12 +299,11 @@ Apart from these, the following properties are also available, and may be useful - - + + @@ -366,6 +329,15 @@ Apart from these, the following properties are also available, and may be useful automatically. + + + + + @@ -376,40 +348,38 @@ Apart from these, the following properties are also available, and may be useful from JVM to Python worker for every task. +
    Property NameDefaultMeaning
    spark.driver.extraJavaOptionsspark.driver.extraClassPath (none) - A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. - + Extra classpath entries to append to the classpath of the driver. +
    Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. - Instead, please set this through the --driver-java-options command line option or in + Instead, please set this through the --driver-class-path command line option or in your default properties file.
    spark.driver.extraClassPathspark.driver.extraJavaOptions (none) - Extra classpath entries to append to the classpath of the driver. - + A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. +
    Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. - Instead, please set this through the --driver-class-path command line option or in + Instead, please set this through the --driver-java-options command line option or in your default properties file.
    spark.executor.extraJavaOptionsspark.executor.extraClassPath (none) - A string of extra JVM options to pass to executors. For instance, GC settings or other - logging. Note that it is illegal to set Spark properties or heap size settings with this - option. Spark properties should be set using a SparkConf object or the - spark-defaults.conf file used with the spark-submit script. Heap size settings can be set - with spark.executor.memory. + Extra classpath entries to append to the classpath of executors. This exists primarily for + backwards-compatibility with older versions of Spark. Users typically should not need to set + this option.
    spark.executor.extraClassPathspark.executor.extraJavaOptions (none) - Extra classpath entries to append to the classpath of executors. This exists primarily - for backwards-compatibility with older versions of Spark. Users typically should not need - to set this option. + A string of extra JVM options to pass to executors. For instance, GC settings or other logging. + Note that it is illegal to set Spark properties or heap size settings with this option. Spark + properties should be set using a SparkConf object or the spark-defaults.conf file used with the + spark-submit script. Heap size settings can be set with spark.executor.memory.
    spark.executor.logs.rolling.maxRetainedFiles(none) + Sets the number of latest rolling log files that are going to be retained by the system. + Older log files will be deleted. Disabled by default. +
    spark.executor.logs.rolling.size.maxBytes(none) + Set the max size of the file by which the executor logs will be rolled over. + Rolling is disabled by default. Value is set in terms of bytes. + See spark.executor.logs.rolling.maxRetainedFiles + for automatic cleaning of old logs. +
    spark.executor.logs.rolling.strategy (none)
    spark.executor.logs.rolling.size.maxBytes(none) - Set the max size of the file by which the executor logs will be rolled over. - Rolling is disabled by default. Value is set in terms of bytes. - See spark.executor.logs.rolling.maxRetainedFiles - for automatic cleaning of old logs. -
    spark.executor.logs.rolling.maxRetainedFiles(none) - Sets the number of latest rolling log files that are going to be retained by the system. - Older log files will be deleted. Disabled by default. -
    spark.executor.userClassPathFirst false
    spark.python.worker.memory512mspark.executorEnv.[EnvironmentVariableName](none) - Amount of memory to use per python worker process during aggregation, in the same - format as JVM memory strings (e.g. 512m, 2g). If the memory - used during aggregation goes above this amount, it will spill the data into disks. + Add the environment variable specified by EnvironmentVariableName to the Executor + process. The user can specify multiple of these to set multiple environment variables.
    spark.python.worker.memory512m + Amount of memory to use per python worker process during aggregation, in the same + format as JVM memory strings (e.g. 512m, 2g). If the memory + used during aggregation goes above this amount, it will spill the data into disks. +
    spark.python.worker.reuse true
    + +#### Shuffle Behavior + + - - + + - - + + - - + + -
    Property NameDefaultMeaning
    spark.executorEnv.[EnvironmentVariableName](none)spark.reducer.maxMbInFlight48 - Add the environment variable specified by EnvironmentVariableName to the Executor - process. The user can specify multiple of these to set multiple environment variables. + Maximum size (in megabytes) of map outputs to fetch simultaneously from each reduce task. Since + each output requires us to create a buffer to receive it, this represents a fixed memory + overhead per reduce task, so keep it small unless you have a large amount of memory.
    spark.mesos.executor.homedriver side SPARK_HOMEspark.shuffle.blockTransferServicenetty - Set the directory in which Spark is installed on the executors in Mesos. By default, the - executors will simply use the driver's Spark home directory, which may not be visible to - them. Note that this is only relevant if a Spark binary package is not specified through - spark.executor.uri. + Implementation to use for transferring shuffle and cached blocks between executors. There + are two implementations available: netty and nio. Netty-based + block transfer is intended to be simpler but equally efficient and is the default option + starting in 1.2.
    spark.mesos.executor.memoryOverheadexecutor memory * 0.07, with minimum of 384spark.shuffle.compresstrue - This value is an additive for spark.executor.memory, specified in MiB, - which is used to calculate the total Mesos task memory. A value of 384 - implies a 384MiB overhead. Additionally, there is a hard-coded 7% minimum - overhead. The final overhead will be the larger of either - `spark.mesos.executor.memoryOverhead` or 7% of `spark.executor.memory`. + Whether to compress map output files. Generally a good idea. Compression will use + spark.io.compression.codec.
    - -#### Shuffle Behavior - - @@ -421,55 +391,46 @@ Apart from these, the following properties are also available, and may be useful - - + + - - + + - - + + - + - - - - - - - + + @@ -481,6 +442,17 @@ Apart from these, the following properties are also available, and may be useful the default option starting in 1.2. + + + + + @@ -490,41 +462,49 @@ Apart from these, the following properties are also available, and may be useful - - + + -
    Property NameDefaultMeaning
    spark.shuffle.consolidateFiles false
    spark.shuffle.spilltruespark.shuffle.file.buffer.kb32 - If set to "true", limits the amount of memory used during reduces by spilling data out to disk. - This spilling threshold is specified by spark.shuffle.memoryFraction. + Size of the in-memory buffer for each shuffle file output stream, in kilobytes. These buffers + reduce the number of disk seeks and system calls made in creating intermediate shuffle files.
    spark.shuffle.spill.compresstruespark.shuffle.io.maxRetries3 - Whether to compress data spilled during shuffles. Compression will use - spark.io.compression.codec. + (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is + set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC + pauses or transient network connectivity issues.
    spark.shuffle.memoryFraction0.2spark.shuffle.io.numConnectionsPerPeer1 - Fraction of Java heap to use for aggregation and cogroups during shuffles, if - spark.shuffle.spill is true. At any given time, the collective size of - all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will - begin to spill to disk. If spills are often, consider increasing this value at the expense of - spark.storage.memoryFraction. + (Netty only) Connections between hosts are reused in order to reduce connection buildup for + large clusters. For clusters with many hard disks and few hosts, this may result in insufficient + concurrency to saturate all disks, and so users may consider increasing this value.
    spark.shuffle.compressspark.shuffle.io.preferDirectBufs true - Whether to compress map output files. Generally a good idea. Compression will use - spark.io.compression.codec. -
    spark.shuffle.file.buffer.kb32 - Size of the in-memory buffer for each shuffle file output stream, in kilobytes. These buffers - reduce the number of disk seeks and system calls made in creating intermediate shuffle files. + (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache + block transfer. For environments where off-heap memory is tightly limited, users may wish to + turn this off to force all allocations from Netty to be on-heap.
    spark.reducer.maxMbInFlight48spark.shuffle.io.retryWait5 - Maximum size (in megabytes) of map outputs to fetch simultaneously from each reduce task. Since - each output requires us to create a buffer to receive it, this represents a fixed memory - overhead per reduce task, so keep it small unless you have a large amount of memory. + (Netty only) Seconds to wait between retries of fetches. The maximum delay caused by retrying + is simply maxRetries * retryWait, by default 15 seconds.
    spark.shuffle.memoryFraction0.2 + Fraction of Java heap to use for aggregation and cogroups during shuffles, if + spark.shuffle.spill is true. At any given time, the collective size of + all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will + begin to spill to disk. If spills are often, consider increasing this value at the expense of + spark.storage.memoryFraction. +
    spark.shuffle.sort.bypassMergeThreshold 200
    spark.shuffle.blockTransferServicenettyspark.shuffle.spilltrue - Implementation to use for transferring shuffle and cached blocks between executors. There - are two implementations available: netty and nio. Netty-based - block transfer is intended to be simpler but equally efficient and is the default option - starting in 1.2. + If set to "true", limits the amount of memory used during reduces by spilling data out to disk. + This spilling threshold is specified by spark.shuffle.memoryFraction.
    - -#### Spark UI + + spark.shuffle.spill.compress + true + + Whether to compress data spilled during shuffles. Compression will use + spark.io.compression.codec. + + + + +#### Spark UI - - + + - - + + - - + + @@ -535,28 +515,26 @@ Apart from these, the following properties are also available, and may be useful - - + + - - + + - - + +
    Property NameDefaultMeaning
    spark.ui.port4040spark.eventLog.compressfalse - Port for your application's dashboard, which shows memory and workload data. + Whether to compress logged events, if spark.eventLog.enabled is true.
    spark.ui.retainedStages1000spark.eventLog.dirfile:///tmp/spark-events - How many stages the Spark UI and status APIs remember before garbage - collecting. + Base directory in which Spark events are logged, if spark.eventLog.enabled is true. + Within this base directory, Spark creates a sub-directory for each application, and logs the + events specific to the application in this directory. Users may want to set this to + a unified location like an HDFS directory so history files can be read by the history server.
    spark.ui.retainedJobs1000spark.eventLog.enabledfalse - How many jobs the Spark UI and status APIs remember before garbage - collecting. + Whether to log Spark events, useful for reconstructing the Web UI after the application has + finished.
    spark.eventLog.enabledfalsespark.ui.port4040 - Whether to log Spark events, useful for reconstructing the Web UI after the application has - finished. + Port for your application's dashboard, which shows memory and workload data.
    spark.eventLog.compressfalsespark.ui.retainedJobs1000 - Whether to compress logged events, if spark.eventLog.enabled is true. + How many jobs the Spark UI and status APIs remember before garbage + collecting.
    spark.eventLog.dirfile:///tmp/spark-eventsspark.ui.retainedStages1000 - Base directory in which Spark events are logged, if spark.eventLog.enabled is true. - Within this base directory, Spark creates a sub-directory for each application, and logs the - events specific to the application in this directory. Users may want to set this to - a unified location like an HDFS directory so history files can be read by the history server. + How many stages the Spark UI and status APIs remember before garbage + collecting.
    @@ -572,12 +550,10 @@ Apart from these, the following properties are also available, and may be useful - spark.rdd.compress - false + spark.closure.serializer + org.apache.spark.serializer.
    JavaSerializer - Whether to compress serialized RDD partitions (e.g. for - StorageLevel.MEMORY_ONLY_SER). Can save substantial space at the cost of some - extra CPU time. + Serializer class to use for closures. Currently only the Java serializer is supported. @@ -593,14 +569,6 @@ Apart from these, the following properties are also available, and may be useful and org.apache.spark.io.SnappyCompressionCodec. - - spark.io.compression.snappy.block.size - 32768 - - Block size (in bytes) used in Snappy compression, in the case when Snappy compression codec - is used. Lowering this block size will also lower shuffle memory usage when Snappy is used. - - spark.io.compression.lz4.block.size 32768 @@ -610,21 +578,20 @@ Apart from these, the following properties are also available, and may be useful - spark.closure.serializer - org.apache.spark.serializer.
    JavaSerializer + spark.io.compression.snappy.block.size + 32768 - Serializer class to use for closures. Currently only the Java serializer is supported. + Block size (in bytes) used in Snappy compression, in the case when Snappy compression codec + is used. Lowering this block size will also lower shuffle memory usage when Snappy is used. - spark.serializer.objectStreamReset - 100 + spark.kryo.classesToRegister + (none) - When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches - objects to prevent writing redundant data, however that stops garbage collection of those - objects. By calling 'reset' you flush that info from the serializer, and allow old - objects to be collected. To turn off this periodic reset set it to -1. - By default it will reset the serializer every 100 objects. + If you use Kryo serialization, give a comma-separated list of custom class names to register + with Kryo. + See the tuning guide for more details. @@ -649,12 +616,16 @@ Apart from these, the following properties are also available, and may be useful - spark.kryoserializer.buffer.mb - 0.064 + spark.kryo.registrator + (none) - Initial size of Kryo's serialization buffer, in megabytes. Note that there will be one buffer - per core on each worker. This buffer will grow up to - spark.kryoserializer.buffer.max.mb if needed. + If you use Kryo serialization, set this class to register your custom classes with Kryo. This + property is useful if you need to register your classes in a custom way, e.g. to specify a custom + field serializer. Otherwise spark.kryo.classesToRegister is simpler. It should be + set to a class that extends + + KryoRegistrator. + See the tuning guide for more details. @@ -666,11 +637,80 @@ Apart from these, the following properties are also available, and may be useful inside Kryo. + + spark.kryoserializer.buffer.mb + 0.064 + + Initial size of Kryo's serialization buffer, in megabytes. Note that there will be one buffer + per core on each worker. This buffer will grow up to + spark.kryoserializer.buffer.max.mb if needed. + + + + spark.rdd.compress + false + + Whether to compress serialized RDD partitions (e.g. for + StorageLevel.MEMORY_ONLY_SER). Can save substantial space at the cost of some + extra CPU time. + + + + spark.serializer + org.apache.spark.serializer.
    JavaSerializer + + Class to use for serializing objects that will be sent over the network or need to be cached + in serialized form. The default of Java serialization works with any Serializable Java object + but is quite slow, so we recommend using + org.apache.spark.serializer.KryoSerializer and configuring Kryo serialization + when speed is necessary. Can be any subclass of + + org.apache.spark.Serializer. + + + + spark.serializer.objectStreamReset + 100 + + When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches + objects to prevent writing redundant data, however that stops garbage collection of those + objects. By calling 'reset' you flush that info from the serializer, and allow old + objects to be collected. To turn off this periodic reset set it to -1. + By default it will reset the serializer every 100 objects. + + #### Execution Behavior + + + + + + + + + + + + + + + - - - + + + - - + + @@ -713,12 +752,23 @@ Apart from these, the following properties are also available, and may be useful - - - + + + + + + + + @@ -729,6 +779,15 @@ Apart from these, the following properties are also available, and may be useful increase it if you configure your own old generation size. + + + + + @@ -747,15 +806,6 @@ Apart from these, the following properties are also available, and may be useful directories on Tachyon file system. - - - - - @@ -763,106 +813,19 @@ Apart from these, the following properties are also available, and may be useful The URL of the underlying Tachyon file system in the TachyonStore. - - - - - - - - - - - - - - - - - - - -
    Property NameDefaultMeaning
    spark.broadcast.blockSize4096 + Size of each piece of a block in kilobytes for TorrentBroadcastFactory. + Too large a value decreases parallelism during broadcast (makes it slower); however, if it is + too small, BlockManager might take a performance hit. +
    spark.broadcast.factoryorg.apache.spark.broadcast.
    TorrentBroadcastFactory
    + Which broadcast implementation to use. +
    spark.cleaner.ttl(infinite) + Duration (seconds) of how long Spark will remember any metadata (stages generated, tasks + generated, etc.). Periodic cleanups will ensure that metadata older than this duration will be + forgotten. This is useful for running Spark for many hours / days (for example, running 24/7 in + case of Spark Streaming applications). Note that any RDD that persists in memory for more than + this duration will be cleared as well. +
    spark.default.parallelism @@ -689,19 +729,18 @@ Apart from these, the following properties are also available, and may be useful
    spark.broadcast.factoryorg.apache.spark.broadcast.
    TorrentBroadcastFactory
    - Which broadcast implementation to use. - spark.executor.heartbeatInterval10000Interval (milliseconds) between each executor's heartbeats to the driver. Heartbeats let + the driver know that the executor is still alive and update it with metrics for in-progress + tasks.
    spark.broadcast.blockSize4096spark.files.fetchTimeout60 - Size of each piece of a block in kilobytes for TorrentBroadcastFactory. - Too large a value decreases parallelism during broadcast (makes it slower); however, if it is - too small, BlockManager might take a performance hit. + Communication timeout to use when fetching files added through SparkContext.addFile() from + the driver, in seconds.
    spark.files.fetchTimeout60 - Communication timeout to use when fetching files added through SparkContext.addFile() from - the driver, in seconds. - spark.hadoop.cloneConffalseIf set to true, clones a new Hadoop Configuration object for each task. This + option should be enabled to work around Configuration thread-safety issues (see + SPARK-2546 for more details). + This is disabled by default in order to avoid unexpected performance regressions for jobs that + are not affected by these issues.
    spark.hadoop.validateOutputSpecstrueIf set to true, validates the output specification (e.g. checking if the output directory already exists) + used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing + output directories. We recommend that users do not disable this except if trying to achieve compatibility with + previous versions of Spark. Simply use Hadoop's FileSystem API to delete output directories by hand. + This setting is ignored for jobs generated through Spark Streaming's StreamingContext, since + data may need to be rewritten to pre-existing output directories during checkpoint recovery.
    spark.storage.memoryFraction
    spark.storage.memoryMapThreshold2097152 + Size of a block, in bytes, above which Spark memory maps when reading a block from disk. + This prevents Spark from memory mapping very small blocks. In general, memory + mapping has high overhead for blocks close to or below the page size of the operating system. +
    spark.storage.unrollFraction 0.2
    spark.storage.memoryMapThreshold2097152 - Size of a block, in bytes, above which Spark memory maps when reading a block from disk. - This prevents Spark from memory mapping very small blocks. In general, memory - mapping has high overhead for blocks close to or below the page size of the operating system. -
    spark.tachyonStore.url tachyon://localhost:19998
    spark.cleaner.ttl(infinite) - Duration (seconds) of how long Spark will remember any metadata (stages generated, tasks - generated, etc.). Periodic cleanups will ensure that metadata older than this duration will be - forgotten. This is useful for running Spark for many hours / days (for example, running 24/7 in - case of Spark Streaming applications). Note that any RDD that persists in memory for more than - this duration will be cleared as well. -
    spark.hadoop.validateOutputSpecstrueIf set to true, validates the output specification (e.g. checking if the output directory already exists) - used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing - output directories. We recommend that users do not disable this except if trying to achieve compatibility with - previous versions of Spark. Simply use Hadoop's FileSystem API to delete output directories by hand. - This setting is ignored for jobs generated through Spark Streaming's StreamingContext, since - data may need to be rewritten to pre-existing output directories during checkpoint recovery.
    spark.hadoop.cloneConffalseIf set to true, clones a new Hadoop Configuration object for each task. This - option should be enabled to work around Configuration thread-safety issues (see - SPARK-2546 for more details). - This is disabled by default in order to avoid unexpected performance regressions for jobs that - are not affected by these issues.
    spark.executor.heartbeatInterval10000Interval (milliseconds) between each executor's heartbeats to the driver. Heartbeats let - the driver know that the executor is still alive and update it with metrics for in-progress - tasks.
    #### Networking - - - - - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - @@ -875,28 +838,18 @@ Apart from these, the following properties are also available, and may be useful - - - - - - - - - - - - + + @@ -910,136 +863,114 @@ Apart from these, the following properties are also available, and may be useful - - - - - - - + + - - + + - - + + - - + + -
    Property NameDefaultMeaning
    spark.driver.host(local hostname) - Hostname or IP address for the driver to listen on. - This is used for communicating with the executors and the standalone Master. -
    spark.driver.port(random) - Port for the driver to listen on. - This is used for communicating with the executors and the standalone Master. -
    spark.fileserver.port(random) - Port for the driver's HTTP file server to listen on. -
    spark.broadcast.port(random) - Port for the driver's HTTP broadcast server to listen on. - This is not relevant for torrent broadcast. -
    spark.replClassServer.port(random)spark.akka.failure-detector.threshold300.0 - Port for the driver's HTTP class server to listen on. - This is only relevant for the Spark shell. -
    spark.blockManager.port(random) - Port for all block managers to listen on. These exist on both the driver and the executors. -
    spark.executor.port(random) - Port for the executor to listen on. This is used for communicating with the driver. -
    spark.port.maxRetries16 - Default maximum number of retries when binding to a port before giving up. + This is set to a larger value to disable failure detector that comes inbuilt akka. It can be + enabled again, if you plan to use this feature (Not recommended). This maps to akka's + `akka.remote.transport-failure-detector.threshold`. Tune this in combination of + `spark.akka.heartbeat.pauses` and `spark.akka.heartbeat.interval` if you need to.
    spark.akka.threads4 - Number of actor threads to use for communication. Can be useful to increase on large clusters - when the driver has a lot of CPU cores. -
    spark.akka.timeout100 - Communication timeout between Spark nodes, in seconds. -
    spark.network.timeout120spark.akka.heartbeat.interval1000 - Default timeout for all network interactions, in seconds. This config will be used in - place of spark.core.connection.ack.wait.timeout, spark.akka.timeout, - spark.storage.blockManagerSlaveTimeoutMs or - spark.shuffle.io.connectionTimeout, if they are not configured. + This is set to a larger value to disable the transport failure detector that comes built in to + Akka. It can be enabled again, if you plan to use this feature (Not recommended). A larger + interval value in seconds reduces network overhead and a smaller value ( ~ 1 s) might be more + informative for Akka's failure detector. Tune this in combination of `spark.akka.heartbeat.pauses` + if you need to. A likely positive use case for using failure detector would be: a sensistive + failure detector can help evict rogue executors quickly. However this is usually not the case + as GC pauses and network lags are expected in a real Spark cluster. Apart from that enabling + this leads to a lot of exchanges of heart beats between nodes leading to flooding the network + with those.
    spark.akka.heartbeat.interval1000 - This is set to a larger value to disable the transport failure detector that comes built in to Akka. - It can be enabled again, if you plan to use this feature (Not recommended). A larger interval - value in seconds reduces network overhead and a smaller value ( ~ 1 s) might be more informative - for Akka's failure detector. Tune this in combination of `spark.akka.heartbeat.pauses` if you need - to. A likely positive use case for using failure detector would be: a sensistive failure detector - can help evict rogue executors quickly. However this is usually not the case as GC pauses - and network lags are expected in a real Spark cluster. Apart from that enabling this leads to - a lot of exchanges of heart beats between nodes leading to flooding the network with those. -
    spark.shuffle.io.preferDirectBufstruespark.akka.threads4 - (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache - block transfer. For environments where off-heap memory is tightly limited, users may wish to - turn this off to force all allocations from Netty to be on-heap. + Number of actor threads to use for communication. Can be useful to increase on large clusters + when the driver has a lot of CPU cores.
    spark.shuffle.io.numConnectionsPerPeer1spark.akka.timeout100 - (Netty only) Connections between hosts are reused in order to reduce connection buildup for - large clusters. For clusters with many hard disks and few hosts, this may result in insufficient - concurrency to saturate all disks, and so users may consider increasing this value. + Communication timeout between Spark nodes, in seconds.
    spark.shuffle.io.maxRetries3spark.blockManager.port(random) - (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is - set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC - pauses or transient network connectivity issues. + Port for all block managers to listen on. These exist on both the driver and the executors.
    spark.shuffle.io.retryWait5spark.broadcast.port(random) - (Netty only) Seconds to wait between retries of fetches. The maximum delay caused by retrying - is simply maxRetries * retryWait. The default maximum delay is therefore - 15 seconds, because the default value of maxRetries is 3, and the default - retryWait here is 5 seconds. + Port for the driver's HTTP broadcast server to listen on. + This is not relevant for torrent broadcast.
    - -#### Scheduling - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + +
    Property NameDefaultMeaning
<tr>
-  <td><code>spark.task.cpus</code></td>
-  <td>1</td>
+  <td><code>spark.driver.host</code></td>
+  <td>(local hostname)</td>
  <td>
-    Number of cores to allocate for each task.
+    Hostname or IP address for the driver to listen on.
+    This is used for communicating with the executors and the standalone Master.
  </td>
</tr>
<tr>
-  <td><code>spark.task.maxFailures</code></td>
-  <td>4</td>
+  <td><code>spark.driver.port</code></td>
+  <td>(random)</td>
  <td>
-    Number of individual task failures before giving up on the job.
-    Should be greater than or equal to 1. Number of allowed retries = this value - 1.
+    Port for the driver to listen on.
+    This is used for communicating with the executors and the standalone Master.
  </td>
</tr>
<tr>
-  <td><code>spark.scheduler.mode</code></td>
-  <td>FIFO</td>
+  <td><code>spark.executor.port</code></td>
+  <td>(random)</td>
  <td>
-    The scheduling mode between
-    jobs submitted to the same SparkContext. Can be set to FAIR
-    to use fair sharing instead of queueing jobs one after another. Useful for
-    multi-user services.
+    Port for the executor to listen on. This is used for communicating with the driver.
  </td>
</tr>
<tr>
-  <td><code>spark.cores.max</code></td>
-  <td>(not set)</td>
+  <td><code>spark.fileserver.port</code></td>
+  <td>(random)</td>
  <td>
-    When running on a standalone deploy cluster or a
-    Mesos cluster in "coarse-grained"
-    sharing mode, the maximum amount of CPU cores to request for the application from
-    across the cluster (not from each machine). If not set, the default will be
-    spark.deploy.defaultCores on Spark's standalone cluster manager, or
-    infinite (all available cores) on Mesos.
+    Port for the driver's HTTP file server to listen on.
  </td>
</tr>
<tr>
-  <td><code>spark.mesos.coarse</code></td>
-  <td>false</td>
+  <td><code>spark.network.timeout</code></td>
+  <td>120</td>
  <td>
-    If set to "true", runs over Mesos clusters in
-    "coarse-grained" sharing mode,
-    where Spark acquires one long-lived Mesos task on each machine instead of one Mesos task per
-    Spark task. This gives lower-latency scheduling for short queries, but leaves resources in use
-    for the whole duration of the Spark job.
+    Default timeout for all network interactions, in seconds. This config will be used in
+    place of spark.core.connection.ack.wait.timeout, spark.akka.timeout,
+    spark.storage.blockManagerSlaveTimeoutMs or
+    spark.shuffle.io.connectionTimeout, if they are not configured.
  </td>
</tr>
<tr>
-  <td><code>spark.speculation</code></td>
-  <td>false</td>
+  <td><code>spark.port.maxRetries</code></td>
+  <td>16</td>
  <td>
-    If set to "true", performs speculative execution of tasks. This means if one or more tasks are
-    running slowly in a stage, they will be re-launched.
+    Default maximum number of retries when binding to a port before giving up.
  </td>
</tr>
<tr>
-  <td><code>spark.speculation.interval</code></td>
-  <td>100</td>
+  <td><code>spark.replClassServer.port</code></td>
+  <td>(random)</td>
  <td>
-    How often Spark will check for tasks to speculate, in milliseconds.
+    Port for the driver's HTTP class server to listen on.
+    This is only relevant for the Spark shell.
  </td>
</tr>
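For reference, and not part of the patch itself, the networking and port properties collected in the rows above can also be set programmatically. The minimal sketch below only uses property names that appear in this table; the concrete port numbers are placeholder values chosen for illustration.

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}

// Sketch: pin the listen ports documented above so a restrictive firewall
// only needs a handful of rules. The port values are placeholders.
val conf = new SparkConf()
  .setAppName("port-pinning-sketch")
  .set("spark.driver.port", "7001")           // driver <-> executors and standalone Master
  .set("spark.fileserver.port", "7002")       // driver's HTTP file server
  .set("spark.broadcast.port", "7003")        // HTTP broadcast server (not used by torrent broadcast)
  .set("spark.replClassServer.port", "7004")  // only relevant for the Spark shell
  .set("spark.blockManager.port", "7005")     // block managers on both the driver and the executors
  .set("spark.executor.port", "7006")         // executor <-> driver
  .set("spark.network.timeout", "120")        // default, in seconds, for the timeouts it subsumes
val sc = new SparkContext(conf)
{% endhighlight %}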
    + +#### Scheduling + + - - + + - - + + @@ -1055,19 +986,19 @@ Apart from these, the following properties are also available, and may be useful - + - + @@ -1078,14 +1009,14 @@ Apart from these, the following properties are also available, and may be useful - - + + - + - - + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + +
    Property NameDefaultMeaning
<tr>
-  <td><code>spark.speculation.quantile</code></td>
-  <td>0.75</td>
+  <td><code>spark.cores.max</code></td>
+  <td>(not set)</td>
  <td>
-    Percentage of tasks which must be complete before speculation is enabled for a particular stage.
+    When running on a standalone deploy cluster or a
+    Mesos cluster in "coarse-grained"
+    sharing mode, the maximum amount of CPU cores to request for the application from
+    across the cluster (not from each machine). If not set, the default will be
+    spark.deploy.defaultCores on Spark's standalone cluster manager, or
+    infinite (all available cores) on Mesos.
  </td>
</tr>
<tr>
-  <td><code>spark.speculation.multiplier</code></td>
-  <td>1.5</td>
+  <td><code>spark.localExecution.enabled</code></td>
+  <td>false</td>
  <td>
-    How many times slower a task is than the median to be considered for speculation.
+    Enables Spark to run certain jobs, such as first() or take() on the driver, without sending
+    tasks to the cluster. This can make certain jobs execute very quickly, but may require
+    shipping a whole partition of data to the driver.
  </td>
</tr>
<tr>
-  <td><code>spark.locality.wait.process</code></td>
+  <td><code>spark.locality.wait.node</code></td>
  <td>spark.locality.wait</td>
  <td>
-    Customize the locality wait for process locality. This affects tasks that attempt to access
-    cached data in a particular executor process.
+    Customize the locality wait for node locality. For example, you can set this to 0 to skip
+    node locality and search immediately for rack locality (if your cluster has rack information).
  </td>
</tr>
<tr>
-  <td><code>spark.locality.wait.node</code></td>
+  <td><code>spark.locality.wait.process</code></td>
  <td>spark.locality.wait</td>
  <td>
-    Customize the locality wait for node locality. For example, you can set this to 0 to skip
-    node locality and search immediately for rack locality (if your cluster has rack information).
+    Customize the locality wait for process locality. This affects tasks that attempt to access
+    cached data in a particular executor process.
  </td>
</tr>
<tr>
-  <td><code>spark.scheduler.revive.interval</code></td>
-  <td>1000</td>
+  <td><code>spark.scheduler.maxRegisteredResourcesWaitingTime</code></td>
+  <td>30000</td>
  <td>
-    The interval length for the scheduler to revive the worker resource offers to run tasks
+    Maximum amount of time to wait for resources to register before scheduling begins
    (in milliseconds).
  </td>
</tr>
    spark.scheduler.minRegisteredResourcesRatio 0.0 for Mesos and Standalone mode, 0.8 for YARN @@ -1098,25 +1029,70 @@ Apart from these, the following properties are also available, and may be useful
<tr>
-  <td><code>spark.scheduler.maxRegisteredResourcesWaitingTime</code></td>
-  <td>30000</td>
+  <td><code>spark.scheduler.mode</code></td>
+  <td>FIFO</td>
  <td>
-    Maximum amount of time to wait for resources to register before scheduling begins
+    The scheduling mode between
+    jobs submitted to the same SparkContext. Can be set to FAIR
+    to use fair sharing instead of queueing jobs one after another. Useful for
+    multi-user services.
  </td>
</tr>
+<tr>
+  <td><code>spark.scheduler.revive.interval</code></td>
+  <td>1000</td>
+  <td>
+    The interval length for the scheduler to revive the worker resource offers to run tasks
    (in milliseconds).
+  </td>
+</tr>
<tr>
-  <td><code>spark.localExecution.enabled</code></td>
+  <td><code>spark.speculation</code></td>
  <td>false</td>
  <td>
-    Enables Spark to run certain jobs, such as first() or take() on the driver, without sending
-    tasks to the cluster. This can make certain jobs execute very quickly, but may require
-    shipping a whole partition of data to the driver.
+    If set to "true", performs speculative execution of tasks. This means if one or more tasks are
+    running slowly in a stage, they will be re-launched.
  </td>
</tr>
+<tr>
+  <td><code>spark.speculation.interval</code></td>
+  <td>100</td>
+  <td>
+    How often Spark will check for tasks to speculate, in milliseconds.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.speculation.multiplier</code></td>
+  <td>1.5</td>
+  <td>
+    How many times slower a task is than the median to be considered for speculation.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.speculation.quantile</code></td>
+  <td>0.75</td>
+  <td>
+    Percentage of tasks which must be complete before speculation is enabled for a particular stage.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.task.cpus</code></td>
+  <td>1</td>
+  <td>
+    Number of cores to allocate for each task.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.task.maxFailures</code></td>
+  <td>4</td>
+  <td>
+    Number of individual task failures before giving up on the job.
+    Should be greater than or equal to 1. Number of allowed retries = this value - 1.
+  </td>
+</tr>
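For reference, and not part of the patch itself, a minimal sketch of how the scheduling properties in this table combine in application code; the values are illustrative only and the property names are the ones documented above.

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}

// Illustrative values; the property names come from the table above.
val conf = new SparkConf()
  .setAppName("scheduling-sketch")
  .set("spark.scheduler.mode", "FAIR")         // fair sharing between jobs in one SparkContext
  .set("spark.speculation", "true")            // re-launch tasks that run slowly in a stage
  .set("spark.speculation.interval", "100")    // check for stragglers every 100 ms
  .set("spark.speculation.quantile", "0.75")   // wait for 75% of tasks before speculating
  .set("spark.speculation.multiplier", "1.5")  // 1.5x slower than the median counts as slow
  .set("spark.task.maxFailures", "4")          // up to 3 retries before the job gives up
  .set("spark.cores.max", "48")                // total cores on standalone / coarse-grained Mesos
val sc = new SparkContext(conf)
{% endhighlight %}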
    -#### Dynamic allocation +#### Dynamic Allocation @@ -1136,10 +1112,19 @@ Apart from these, the following properties are also available, and may be useful + + + + + + - @@ -1150,10 +1135,10 @@ Apart from these, the following properties are also available, and may be useful - + @@ -1174,20 +1159,30 @@ Apart from these, the following properties are also available, and may be useful description. - - - - -
    Property NameDefaultMeaning
+<tr>
+  <td><code>spark.dynamicAllocation.executorIdleTimeout</code></td>
+  <td>600</td>
+  <td>
+    If dynamic allocation is enabled and an executor has been idle for more than this duration
+    (in seconds), the executor will be removed. For more detail, see this
+    description.
+  </td>
+</tr>
    spark.dynamicAllocation.initialExecutors spark.dynamicAllocation.minExecutors0 - Lower bound for the number of executors if dynamic allocation is enabled. + Initial number of executors to run if dynamic allocation is enabled.
    spark.dynamicAllocation.maxExecutors spark.dynamicAllocation.minExecutors0 - Initial number of executors to run if dynamic allocation is enabled. + Lower bound for the number of executors if dynamic allocation is enabled.
-<tr>
-  <td><code>spark.dynamicAllocation.executorIdleTimeout</code></td>
-  <td>600</td>
-  <td>
-    If dynamic allocation is enabled and an executor has been idle for more than this duration
-    (in seconds), the executor will be removed. For more detail, see this
-    description.
-  </td>
-</tr>
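For reference, and not part of the patch itself, a sketch of the dynamic allocation knobs above wired together. The `spark.dynamicAllocation.enabled` flag and the external shuffle service requirement are assumptions here; they are documented elsewhere, not in this hunk, and the numbers are illustrative.

{% highlight scala %}
import org.apache.spark.SparkConf

// Sketch only; values are illustrative. The `enabled` flag is an assumption,
// not part of the rows shown above.
val conf = new SparkConf()
  .set("spark.dynamicAllocation.enabled", "true")
  .set("spark.dynamicAllocation.minExecutors", "2")           // lower bound
  .set("spark.dynamicAllocation.initialExecutors", "4")       // starting number of executors
  .set("spark.dynamicAllocation.maxExecutors", "20")          // upper bound
  .set("spark.dynamicAllocation.executorIdleTimeout", "600")  // seconds idle before removal
{% endhighlight %}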
    #### Security + + + + + + + + + + @@ -1204,6 +1199,15 @@ Apart from these, the following properties are also available, and may be useful not running on YARN and authentication is enabled. + + + + + @@ -1213,12 +1217,11 @@ Apart from these, the following properties are also available, and may be useful - - + + @@ -1235,16 +1238,6 @@ Apart from these, the following properties are also available, and may be useful -Dspark.com.test.filter1.params='param1=foo,param2=testing' - - - - - @@ -1253,23 +1246,6 @@ Apart from these, the following properties are also available, and may be useful user that started the Spark job has view access. - - - - - - - - - -
    Property NameDefaultMeaning
+<tr>
+  <td><code>spark.acls.enable</code></td>
+  <td>false</td>
+  <td>
+    Whether Spark acls are enabled. If enabled, this checks to see if the user has
+    access permissions to view or modify the job. Note this requires the user to be known,
+    so if the user comes across as null no checks are done. Filters can be used with the UI
+    to authenticate and set the user.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.admin.acls</code></td>
+  <td>Empty</td>
+  <td>
+    Comma separated list of users/administrators that have view and modify access to all Spark jobs.
+    This can be used if you run on a shared cluster and have a set of administrators or devs who
+    help debug when things do not work.
+  </td>
+</tr>
    spark.authenticate false
+<tr>
+  <td><code>spark.core.connection.ack.wait.timeout</code></td>
+  <td>60</td>
+  <td>
+    Number of seconds for the connection to wait for an ack to occur before timing
+    out and giving up. To avoid unwanted timeouts caused by long pauses such as GC,
+    you can set a larger value.
+  </td>
+</tr>
    spark.core.connection.auth.wait.timeout 30
<tr>
-  <td><code>spark.core.connection.ack.wait.timeout</code></td>
-  <td>60</td>
+  <td><code>spark.modify.acls</code></td>
+  <td>Empty</td>
  <td>
-    Number of seconds for the connection to wait for ack to occur before timing
-    out and giving up. To avoid unwilling timeout caused by long pause like GC,
-    you can set larger value.
+    Comma separated list of users that have modify access to the Spark job. By default only the
+    user that started the Spark job has access to modify it (kill it for example).
  </td>
</tr>
-<tr>
-  <td><code>spark.acls.enable</code></td>
-  <td>false</td>
-  <td>
-    Whether Spark acls should are enabled. If enabled, this checks to see if the user has
-    access permissions to view or modify the job. Note this requires the user to be known,
-    so if the user comes across as null no checks are done. Filters can be used with the UI
-    to authenticate and set the user.
-  </td>
-</tr>
    spark.ui.view.acls Empty
    spark.modify.aclsEmpty - Comma separated list of users that have modify access to the Spark job. By default only the - user that started the Spark job has access to modify it (kill it for example). -
    spark.admin.aclsEmpty - Comma separated list of users/administrators that have view and modify access to all Spark jobs. - This can be used if you run on a shared cluster and have a set of administrators or devs who - help debug when things work. -
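For reference, and not part of the patch itself, a minimal sketch of the ACL properties in this section used together; the user names are placeholders.

{% highlight scala %}
import org.apache.spark.SparkConf

// Placeholder user names; the property names come from the Security table above.
val conf = new SparkConf()
  .set("spark.authenticate", "true")        // require authentication of internal connections
  .set("spark.acls.enable", "true")         // turn on view/modify permission checks
  .set("spark.ui.view.acls", "alice,bob")   // extra users allowed to view the web UI
  .set("spark.modify.acls", "alice")        // extra users allowed to modify (e.g. kill) the job
  .set("spark.admin.acls", "opsadmin")      // view and modify access to all Spark jobs
{% endhighlight %}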
    #### Encryption @@ -1293,6 +1269,23 @@ Apart from these, the following properties are also available, and may be useful file server.

    + + spark.ssl.enabledAlgorithms + Empty + + A comma separated list of ciphers. The specified ciphers must be supported by JVM. + The reference list of protocols one can find on + this + page. + + + + spark.ssl.keyPassword + None + + A password to the private key in key-store. + + spark.ssl.keyStore None @@ -1309,10 +1302,12 @@ Apart from these, the following properties are also available, and may be useful - spark.ssl.keyPassword + spark.ssl.protocol None - A password to the private key in key-store. + A protocol name. The protocol must be supported by JVM. The reference list of protocols + one can find on this + page. @@ -1330,25 +1325,6 @@ Apart from these, the following properties are also available, and may be useful A password to the trust-store. - - spark.ssl.protocol - None - - A protocol name. The protocol must be supported by JVM. The reference list of protocols - one can find on this - page. - - - - spark.ssl.enabledAlgorithms - Empty - - A comma separated list of ciphers. The specified ciphers must be supported by JVM. - The reference list of protocols one can find on - this - page. - - diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 78358499fd01f..db1173a06b0b1 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -197,7 +197,11 @@ See the [configuration page](configuration.html) for information on Spark config spark.mesos.coarse false - Set the run mode for Spark on Mesos. For more information about the run mode, refer to #Mesos Run Mode section above. + If set to "true", runs over Mesos clusters in + "coarse-grained" sharing mode, + where Spark acquires one long-lived Mesos task on each machine instead of one Mesos task per + Spark task. This gives lower-latency scheduling for short queries, but leaves resources in use + for the whole duration of the Spark job. @@ -211,19 +215,23 @@ See the [configuration page](configuration.html) for information on Spark config spark.mesos.executor.home - SPARK_HOME + driver side SPARK_HOME - The location where the mesos executor will look for Spark binaries to execute, and uses the SPARK_HOME setting on default. - This variable is only used when no spark.executor.uri is provided, and assumes Spark is installed on the specified location - on each slave. + Set the directory in which Spark is installed on the executors in Mesos. By default, the + executors will simply use the driver's Spark home directory, which may not be visible to + them. Note that this is only relevant if a Spark binary package is not specified through + spark.executor.uri. spark.mesos.executor.memoryOverhead - 384 + executor memory * 0.07, with minimum of 384 - The amount of memory that Mesos executor will request for the task to account for the overhead of running the executor itself. - The final total amount of memory allocated is the maximum value between executor memory plus memoryOverhead, and overhead fraction (1.07) plus the executor memory. + This value is an additive for spark.executor.memory, specified in MiB, + which is used to calculate the total Mesos task memory. A value of 384 + implies a 384MiB overhead. Additionally, there is a hard-coded 7% minimum + overhead. The final overhead will be the larger of either + `spark.mesos.executor.memoryOverhead` or 7% of `spark.executor.memory`. From d20559b157743981b9c09e286f2aaff8cbefab59 Mon Sep 17 00:00:00 2001 From: "Joseph K. 
Bradley" Date: Wed, 25 Feb 2015 16:13:17 -0800 Subject: [PATCH 312/357] [SPARK-5974] [SPARK-5980] [mllib] [python] [docs] Update ML guide with save/load, Python GBT * Add GradientBoostedTrees Python examples to ML guide * I ran these in the pyspark shell, and they worked. * Add save/load to examples in ML guide * Added note to python docs about predict,transform not working within RDD actions,transformations in some cases (See SPARK-5981) CC: mengxr Author: Joseph K. Bradley Closes #4750 from jkbradley/SPARK-5974 and squashes the following commits: c410e38 [Joseph K. Bradley] Added note to LabeledPoint about attributes bcae18b [Joseph K. Bradley] Added import of models for save/load examples in ml guide. Fixed line length for tree.py, feature.py (but not other ML Pyspark files yet). 6d81c3e [Joseph K. Bradley] completed python GBT examples 9903309 [Joseph K. Bradley] Added note to python docs about predict,transform not working within RDD actions,transformations in some cases c7dfad8 [Joseph K. Bradley] Added model save/load to ML guide. Added GBT examples to ML guide --- docs/mllib-classification-regression.md | 9 +- docs/mllib-collaborative-filtering.md | 9 ++ docs/mllib-decision-tree.md | 20 +++ docs/mllib-ensembles.md | 94 +++++++++++++- docs/mllib-linear-methods.md | 21 +++- docs/mllib-naive-bayes.md | 10 +- python/pyspark/mllib/feature.py | 67 ++++++---- python/pyspark/mllib/regression.py | 7 +- python/pyspark/mllib/tree.py | 156 ++++++++++++++---------- 9 files changed, 296 insertions(+), 97 deletions(-) diff --git a/docs/mllib-classification-regression.md b/docs/mllib-classification-regression.md index 5b9b4dd83b774..8e91d62f4a907 100644 --- a/docs/mllib-classification-regression.md +++ b/docs/mllib-classification-regression.md @@ -17,13 +17,13 @@ the supported algorithms for each type of problem. - Binary Classificationlinear SVMs, logistic regression, decision trees, naive Bayes + Binary Classificationlinear SVMs, logistic regression, decision trees, random forests, gradient-boosted trees, naive Bayes - Multiclass Classificationdecision trees, naive Bayes + Multiclass Classificationdecision trees, random forests, naive Bayes - Regressionlinear least squares, Lasso, ridge regression, decision trees, isotonic regression + Regressionlinear least squares, Lasso, ridge regression, decision trees, random forests, gradient-boosted trees, isotonic regression @@ -34,5 +34,8 @@ More details for these methods can be found here: * [binary classification (SVMs, logistic regression)](mllib-linear-methods.html#binary-classification) * [linear regression (least squares, Lasso, ridge)](mllib-linear-methods.html#linear-least-squares-lasso-and-ridge-regression) * [Decision trees](mllib-decision-tree.html) +* [Ensembles of decision trees](mllib-ensembles.html) + * [random forests](mllib-ensembles.html#random-forests) + * [gradient-boosted trees](mllib-ensembles.html#gradient-boosted-trees-gbts) * [Naive Bayes](mllib-naive-bayes.html) * [Isotonic regression](mllib-isotonic-regression.html) diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index ef18cec9371d6..935cd8dad3b25 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -66,6 +66,7 @@ recommendation model by measuring the Mean Squared Error of rating prediction. 
{% highlight scala %} import org.apache.spark.mllib.recommendation.ALS +import org.apache.spark.mllib.recommendation.MatrixFactorizationModel import org.apache.spark.mllib.recommendation.Rating // Load and parse the data @@ -95,6 +96,9 @@ val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) => err * err }.mean() println("Mean Squared Error = " + MSE) + +model.save("myModelPath") +val sameModel = MatrixFactorizationModel.load("myModelPath") {% endhighlight %} If the rating matrix is derived from another source of information (e.g., it is inferred from @@ -181,6 +185,9 @@ public class CollaborativeFiltering { } ).rdd()).mean(); System.out.println("Mean Squared Error = " + MSE); + + model.save("myModelPath"); + MatrixFactorizationModel sameModel = MatrixFactorizationModel.load("myModelPath"); } } {% endhighlight %} @@ -191,6 +198,8 @@ In the following example we load rating data. Each row consists of a user, a pro We use the default ALS.train() method which assumes ratings are explicit. We evaluate the recommendation by measuring the Mean Squared Error of rating prediction. +Note that the Python API does not yet support model save/load but will in the future. + {% highlight python %} from pyspark.mllib.recommendation import ALS, Rating diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index 6675133a810db..4695d1cde4901 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -194,6 +194,7 @@ maximum tree depth of 5. The test error is calculated to measure the algorithm a
    {% highlight scala %} import org.apache.spark.mllib.tree.DecisionTree +import org.apache.spark.mllib.tree.model.DecisionTreeModel import org.apache.spark.mllib.util.MLUtils // Load and parse the data file. @@ -221,6 +222,9 @@ val labelAndPreds = testData.map { point => val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() println("Test Error = " + testErr) println("Learned classification tree model:\n" + model.toDebugString) + +model.save("myModelPath") +val sameModel = DecisionTreeModel.load("myModelPath") {% endhighlight %}
    @@ -279,10 +283,16 @@ Double testErr = }).count() / testData.count(); System.out.println("Test Error: " + testErr); System.out.println("Learned classification tree model:\n" + model.toDebugString()); + +model.save("myModelPath"); +DecisionTreeModel sameModel = DecisionTreeModel.load("myModelPath"); {% endhighlight %}
    + +Note that the Python API does not yet support model save/load but will in the future. + {% highlight python %} from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.tree import DecisionTree @@ -324,6 +334,7 @@ depth of 5. The Mean Squared Error (MSE) is computed at the end to evaluate
    {% highlight scala %} import org.apache.spark.mllib.tree.DecisionTree +import org.apache.spark.mllib.tree.model.DecisionTreeModel import org.apache.spark.mllib.util.MLUtils // Load and parse the data file. @@ -350,6 +361,9 @@ val labelsAndPredictions = testData.map { point => val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() println("Test Mean Squared Error = " + testMSE) println("Learned regression tree model:\n" + model.toDebugString) + +model.save("myModelPath") +val sameModel = DecisionTreeModel.load("myModelPath") {% endhighlight %}
    @@ -414,10 +428,16 @@ Double testMSE = }) / data.count(); System.out.println("Test Mean Squared Error: " + testMSE); System.out.println("Learned regression tree model:\n" + model.toDebugString()); + +model.save("myModelPath"); +DecisionTreeModel sameModel = DecisionTreeModel.load("myModelPath"); {% endhighlight %}
    + +Note that the Python API does not yet support model save/load but will in the future. + {% highlight python %} from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.tree import DecisionTree diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md index 00040e6073d0d..ddae84165f8a9 100644 --- a/docs/mllib-ensembles.md +++ b/docs/mllib-ensembles.md @@ -98,6 +98,7 @@ The test error is calculated to measure the algorithm accuracy.
    {% highlight scala %} import org.apache.spark.mllib.tree.RandomForest +import org.apache.spark.mllib.tree.model.RandomForestModel import org.apache.spark.mllib.util.MLUtils // Load and parse the data file. @@ -127,6 +128,9 @@ val labelAndPreds = testData.map { point => val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() println("Test Error = " + testErr) println("Learned classification forest model:\n" + model.toDebugString) + +model.save("myModelPath") +val sameModel = RandomForestModel.load("myModelPath") {% endhighlight %}
    @@ -188,10 +192,16 @@ Double testErr = }).count() / testData.count(); System.out.println("Test Error: " + testErr); System.out.println("Learned classification forest model:\n" + model.toDebugString()); + +model.save("myModelPath"); +RandomForestModel sameModel = RandomForestModel.load("myModelPath"); {% endhighlight %}
    + +Note that the Python API does not yet support model save/load but will in the future. + {% highlight python %} from pyspark.mllib.tree import RandomForest from pyspark.mllib.util import MLUtils @@ -235,6 +245,7 @@ The Mean Squared Error (MSE) is computed at the end to evaluate
    {% highlight scala %} import org.apache.spark.mllib.tree.RandomForest +import org.apache.spark.mllib.tree.model.RandomForestModel import org.apache.spark.mllib.util.MLUtils // Load and parse the data file. @@ -264,6 +275,9 @@ val labelsAndPredictions = testData.map { point => val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() println("Test Mean Squared Error = " + testMSE) println("Learned regression forest model:\n" + model.toDebugString) + +model.save("myModelPath") +val sameModel = RandomForestModel.load("myModelPath") {% endhighlight %}
    @@ -328,10 +342,16 @@ Double testMSE = }) / testData.count(); System.out.println("Test Mean Squared Error: " + testMSE); System.out.println("Learned regression forest model:\n" + model.toDebugString()); + +model.save("myModelPath"); +RandomForestModel sameModel = RandomForestModel.load("myModelPath"); {% endhighlight %}
    + +Note that the Python API does not yet support model save/load but will in the future. + {% highlight python %} from pyspark.mllib.tree import RandomForest from pyspark.mllib.util import MLUtils @@ -441,8 +461,6 @@ iterations. ### Examples -GBTs currently have APIs in Scala and Java. Examples in both languages are shown below. - #### Classification The example below demonstrates how to load a @@ -457,6 +475,7 @@ The test error is calculated to measure the algorithm accuracy. {% highlight scala %} import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy +import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.mllib.util.MLUtils // Load and parse the data file. @@ -484,6 +503,9 @@ val labelAndPreds = testData.map { point => val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() println("Test Error = " + testErr) println("Learned classification GBT model:\n" + model.toDebugString) + +model.save("myModelPath") +val sameModel = GradientBoostedTreesModel.load("myModelPath") {% endhighlight %}
    @@ -545,6 +567,38 @@ Double testErr = }).count() / testData.count(); System.out.println("Test Error: " + testErr); System.out.println("Learned classification GBT model:\n" + model.toDebugString()); + +model.save("myModelPath"); +GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load("myModelPath"); +{% endhighlight %} + + +
    + +Note that the Python API does not yet support model save/load but will in the future. + +{% highlight python %} +from pyspark.mllib.tree import GradientBoostedTrees +from pyspark.mllib.util import MLUtils + +# Load and parse the data file. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a GradientBoostedTrees model. +# Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous. +# (b) Use more iterations in practice. +model = GradientBoostedTrees.trainClassifier(trainingData, + categoricalFeaturesInfo={}, numIterations=3) + +# Evaluate model on test instances and compute test error +predictions = model.predict(testData.map(lambda x: x.features)) +labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) +testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) +print('Test Error = ' + str(testErr)) +print('Learned classification GBT model:') +print(model.toDebugString()) {% endhighlight %}
    @@ -565,6 +619,7 @@ The Mean Squared Error (MSE) is computed at the end to evaluate {% highlight scala %} import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy +import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.mllib.util.MLUtils // Load and parse the data file. @@ -591,6 +646,9 @@ val labelsAndPredictions = testData.map { point => val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() println("Test Mean Squared Error = " + testMSE) println("Learned regression GBT model:\n" + model.toDebugString) + +model.save("myModelPath") +val sameModel = GradientBoostedTreesModel.load("myModelPath") {% endhighlight %} @@ -658,6 +716,38 @@ Double testMSE = }) / data.count(); System.out.println("Test Mean Squared Error: " + testMSE); System.out.println("Learned regression GBT model:\n" + model.toDebugString()); + +model.save("myModelPath"); +GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load("myModelPath"); +{% endhighlight %} + + +
    + +Note that the Python API does not yet support model save/load but will in the future. + +{% highlight python %} +from pyspark.mllib.tree import GradientBoostedTrees +from pyspark.mllib.util import MLUtils + +# Load and parse the data file. +data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") +# Split the data into training and test sets (30% held out for testing) +(trainingData, testData) = data.randomSplit([0.7, 0.3]) + +# Train a GradientBoostedTrees model. +# Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous. +# (b) Use more iterations in practice. +model = GradientBoostedTrees.trainRegressor(trainingData, + categoricalFeaturesInfo={}, numIterations=3) + +# Evaluate model on test instances and compute test error +predictions = model.predict(testData.map(lambda x: x.features)) +labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) +testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count()) +print('Test Mean Squared Error = ' + str(testMSE)) +print('Learned regression GBT model:') +print(model.toDebugString()) {% endhighlight %}
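The new GBT listings above train with `numIterations=3` so the examples run quickly, and the accompanying notes already say to use more iterations in practice. As a hedged reference, not part of the patch, a more realistic Scala configuration through `BoostingStrategy` could look like the sketch below; it assumes `trainingData` is an `RDD[LabeledPoint]` loaded as in the examples above, and the concrete numbers are only suggestions.

{% highlight scala %}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy

// Assumes trainingData: RDD[LabeledPoint], loaded as in the surrounding examples.
val boostingStrategy = BoostingStrategy.defaultParams("Classification")
boostingStrategy.numIterations = 100      // the doc examples use 3 only to keep them fast
boostingStrategy.learningRate = 0.1       // shrinkage; smaller values usually need more iterations
boostingStrategy.treeStrategy.maxDepth = 5
// Empty map: treat every feature as continuous.
boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()

val model = GradientBoostedTrees.train(trainingData, boostingStrategy)
{% endhighlight %}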
    diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index 44b7f67c57734..d9fc63b37d116 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -190,7 +190,7 @@ error. {% highlight scala %} import org.apache.spark.SparkContext -import org.apache.spark.mllib.classification.SVMWithSGD +import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.Vectors @@ -222,6 +222,9 @@ val metrics = new BinaryClassificationMetrics(scoreAndLabels) val auROC = metrics.areaUnderROC() println("Area under ROC = " + auROC) + +model.save("myModelPath") +val sameModel = SVMModel.load("myModelPath") {% endhighlight %} The `SVMWithSGD.train()` method by default performs L2 regularization with the @@ -304,6 +307,9 @@ public class SVMClassifier { double auROC = metrics.areaUnderROC(); System.out.println("Area under ROC = " + auROC); + + model.save("myModelPath"); + SVMModel sameModel = SVMModel.load("myModelPath"); } } {% endhighlight %} @@ -338,6 +344,8 @@ a dependency. The following example shows how to load a sample dataset, build Logistic Regression model, and make predictions with the resulting model to compute the training error. +Note that the Python API does not yet support model save/load but will in the future. + {% highlight python %} from pyspark.mllib.classification import LogisticRegressionWithSGD from pyspark.mllib.regression import LabeledPoint @@ -391,8 +399,9 @@ values. We compute the mean squared error at the end to evaluate [goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit). {% highlight scala %} -import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.regression.LinearRegressionModel +import org.apache.spark.mllib.regression.LinearRegressionWithSGD import org.apache.spark.mllib.linalg.Vectors // Load and parse the data @@ -413,6 +422,9 @@ val valuesAndPreds = parsedData.map { point => } val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean() println("training Mean Squared Error = " + MSE) + +model.save("myModelPath") +val sameModel = LinearRegressionModel.load("myModelPath") {% endhighlight %} [`RidgeRegressionWithSGD`](api/scala/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD) @@ -483,6 +495,9 @@ public class LinearRegression { } ).rdd()).mean(); System.out.println("training Mean Squared Error = " + MSE); + + model.save("myModelPath"); + LinearRegressionModel sameModel = LinearRegressionModel.load("myModelPath"); } } {% endhighlight %} @@ -494,6 +509,8 @@ The example then uses LinearRegressionWithSGD to build a simple linear model to values. We compute the mean squared error at the end to evaluate [goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit). +Note that the Python API does not yet support model save/load but will in the future. + {% highlight python %} from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD from numpy import array diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md index d5b044d94fdd7..81173255b590d 100644 --- a/docs/mllib-naive-bayes.md +++ b/docs/mllib-naive-bayes.md @@ -37,7 +37,7 @@ smoothing parameter `lambda` as input, and output a can be used for evaluation and prediction. 
{% highlight scala %} -import org.apache.spark.mllib.classification.NaiveBayes +import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint @@ -55,6 +55,9 @@ val model = NaiveBayes.train(training, lambda = 1.0) val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() + +model.save("myModelPath") +val sameModel = NaiveBayesModel.load("myModelPath") {% endhighlight %} @@ -93,6 +96,9 @@ double accuracy = predictionAndLabel.filter(new Function, return pl._1().equals(pl._2()); } }).count() / (double) test.count(); + +model.save("myModelPath"); +NaiveBayesModel sameModel = NaiveBayesModel.load("myModelPath"); {% endhighlight %} @@ -105,6 +111,8 @@ smoothing parameter `lambda` as input, and output a [NaiveBayesModel](api/python/pyspark.mllib.classification.NaiveBayesModel-class.html), which can be used for evaluation and prediction. +Note that the Python API does not yet support model save/load but will in the future. + {% highlight python %} from pyspark.mllib.regression import LabeledPoint diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 10df6288065b8..0ffe092a07365 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -58,7 +58,8 @@ class Normalizer(VectorTransformer): For any 1 <= `p` < float('inf'), normalizes samples using sum(abs(vector) :sup:`p`) :sup:`(1/p)` as norm. - For `p` = float('inf'), max(abs(vector)) will be used as norm for normalization. + For `p` = float('inf'), max(abs(vector)) will be used as norm for + normalization. >>> v = Vectors.dense(range(3)) >>> nor = Normalizer(1) @@ -120,9 +121,14 @@ def transform(self, vector): """ Applies standardization transformation on a vector. + Note: In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. + :param vector: Vector or RDD of Vector to be standardized. - :return: Standardized vector. If the variance of a column is zero, - it will return default `0.0` for the column with zero variance. + :return: Standardized vector. If the variance of a column is + zero, it will return default `0.0` for the column with + zero variance. """ return JavaVectorTransformer.transform(self, vector) @@ -148,9 +154,10 @@ def __init__(self, withMean=False, withStd=True): """ :param withMean: False by default. Centers the data with mean before scaling. It will build a dense output, so this - does not work on sparse input and will raise an exception. - :param withStd: True by default. Scales the data to unit standard - deviation. + does not work on sparse input and will raise an + exception. + :param withStd: True by default. Scales the data to unit + standard deviation. """ if not (withMean or withStd): warnings.warn("Both withMean and withStd are false. The model does nothing.") @@ -159,10 +166,11 @@ def __init__(self, withMean=False, withStd=True): def fit(self, dataset): """ - Computes the mean and variance and stores as a model to be used for later scaling. + Computes the mean and variance and stores as a model to be used + for later scaling. - :param data: The data used to compute the mean and variance to build - the transformation model. + :param data: The data used to compute the mean and variance + to build the transformation model. 
:return: a StandardScalarModel """ dataset = dataset.map(_convert_to_vector) @@ -174,7 +182,8 @@ class HashingTF(object): """ .. note:: Experimental - Maps a sequence of terms to their term frequencies using the hashing trick. + Maps a sequence of terms to their term frequencies using the hashing + trick. Note: the terms must be hashable (can not be dict/set/list...). @@ -195,8 +204,9 @@ def indexOf(self, term): def transform(self, document): """ - Transforms the input document (list of terms) to term frequency vectors, - or transform the RDD of document to RDD of term frequency vectors. + Transforms the input document (list of terms) to term frequency + vectors, or transform the RDD of document to RDD of term + frequency vectors. """ if isinstance(document, RDD): return document.map(self.transform) @@ -220,7 +230,12 @@ def transform(self, x): the terms which occur in fewer than `minDocFreq` documents will have an entry of 0. - :param x: an RDD of term frequency vectors or a term frequency vector + Note: In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. + + :param x: an RDD of term frequency vectors or a term frequency + vector :return: an RDD of TF-IDF vectors or a TF-IDF vector """ if isinstance(x, RDD): @@ -241,9 +256,9 @@ class IDF(object): of documents that contain term `t`. This implementation supports filtering out terms which do not appear - in a minimum number of documents (controlled by the variable `minDocFreq`). - For terms that are not in at least `minDocFreq` documents, the IDF is - found as 0, resulting in TF-IDFs of 0. + in a minimum number of documents (controlled by the variable + `minDocFreq`). For terms that are not in at least `minDocFreq` + documents, the IDF is found as 0, resulting in TF-IDFs of 0. >>> n = 4 >>> freqs = [Vectors.sparse(n, (1, 3), (1.0, 2.0)), @@ -325,15 +340,16 @@ class Word2Vec(object): The vector representation can be used as features in natural language processing and machine learning algorithms. - We used skip-gram model in our implementation and hierarchical softmax - method to train the model. The variable names in the implementation - matches the original C implementation. + We used skip-gram model in our implementation and hierarchical + softmax method to train the model. The variable names in the + implementation matches the original C implementation. - For original C implementation, see https://code.google.com/p/word2vec/ + For original C implementation, + see https://code.google.com/p/word2vec/ For research papers, see Efficient Estimation of Word Representations in Vector Space - and - Distributed Representations of Words and Phrases and their Compositionality. + and Distributed Representations of Words and Phrases and their + Compositionality. >>> sentence = "a b " * 100 + "a c " * 10 >>> localDoc = [sentence, sentence] @@ -374,15 +390,16 @@ def setLearningRate(self, learningRate): def setNumPartitions(self, numPartitions): """ - Sets number of partitions (default: 1). Use a small number for accuracy. + Sets number of partitions (default: 1). Use a small number for + accuracy. """ self.numPartitions = numPartitions return self def setNumIterations(self, numIterations): """ - Sets number of iterations (default: 1), which should be smaller than or equal to number of - partitions. + Sets number of iterations (default: 1), which should be smaller + than or equal to number of partitions. 
""" self.numIterations = numIterations return self diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 21751cc68f308..66617abb85670 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -31,8 +31,11 @@ class LabeledPoint(object): The features and labels of a data point. :param label: Label for this data point. - :param features: Vector of features for this point (NumPy array, list, - pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix) + :param features: Vector of features for this point (NumPy array, + list, pyspark.mllib.linalg.SparseVector, or scipy.sparse + column matrix) + + Note: 'label' and 'features' are accessible as class attributes. """ def __init__(self, label, features): diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 02d551b87dcc0..73618f0449ad4 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -33,6 +33,10 @@ def predict(self, x): """ Predict values for a single data point or an RDD of points using the model trained. + + Note: In Python, predict cannot currently be used within an RDD + transformation or action. + Call predict directly on the RDD instead. """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -48,7 +52,8 @@ def numTrees(self): def totalNumNodes(self): """ - Get total number of nodes, summed over all trees in the ensemble. + Get total number of nodes, summed over all trees in the + ensemble. """ return self.call("totalNumNodes") @@ -71,6 +76,10 @@ def predict(self, x): """ Predict the label of one or more examples. + Note: In Python, predict cannot currently be used within an RDD + transformation or action. + Call predict directly on the RDD instead. + :param x: Data point (feature vector), or an RDD of data points (feature vectors). """ @@ -99,7 +108,8 @@ class DecisionTree(object): """ .. note:: Experimental - Learning algorithm for a decision tree model for classification or regression. + Learning algorithm for a decision tree model for classification or + regression. """ @classmethod @@ -176,17 +186,17 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training data: RDD of LabeledPoint. Labels are real numbers. - :param categoricalFeaturesInfo: Map from categorical feature index - to number of categories. - Any feature not in this map - is treated as continuous. + :param categoricalFeaturesInfo: Map from categorical feature + index to number of categories. + Any feature not in this map is treated as continuous. :param impurity: Supported values: "variance" :param maxDepth: Max depth of tree. - E.g., depth 0 means 1 leaf node. - Depth 1 means 1 internal node + 2 leaf nodes. - :param maxBins: Number of bins used for finding splits at each node. - :param minInstancesPerNode: Min number of instances required at child - nodes to create the parent split + E.g., depth 0 means 1 leaf node. + Depth 1 means 1 internal node + 2 leaf nodes. + :param maxBins: Number of bins used for finding splits at each + node. + :param minInstancesPerNode: Min number of instances required at + child nodes to create the parent split :param minInfoGain: Min info gain required to create a split :return: DecisionTreeModel @@ -229,7 +239,8 @@ class RandomForest(object): """ .. note:: Experimental - Learning algorithm for a random forest model for classification or regression. + Learning algorithm for a random forest model for classification or + regression. 
""" supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") @@ -256,26 +267,33 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, Method to train a decision tree model for binary or multiclass classification. - :param data: Training dataset: RDD of LabeledPoint. Labels should take - values {0, 1, ..., numClasses-1}. + :param data: Training dataset: RDD of LabeledPoint. Labels + should take values {0, 1, ..., numClasses-1}. :param numClasses: number of classes for classification. - :param categoricalFeaturesInfo: Map storing arity of categorical features. - E.g., an entry (n -> k) indicates that feature n is categorical - with k categories indexed from 0: {0, 1, ..., k-1}. + :param categoricalFeaturesInfo: Map storing arity of categorical + features. E.g., an entry (n -> k) indicates that + feature n is categorical with k categories indexed + from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. - :param featureSubsetStrategy: Number of features to consider for splits at - each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt". - :param impurity: Criterion used for information gain calculation. + :param featureSubsetStrategy: Number of features to consider for + splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", + "onethird". + If "auto" is set, this parameter is set based on + numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt". + :param impurity: Criterion used for information gain + calculation. Supported values: "gini" (recommended) or "entropy". - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; - depth 1 means 1 internal node + 2 leaf nodes. (default: 4) - :param maxBins: maximum number of bins used for splitting features + :param maxDepth: Maximum depth of the tree. + E.g., depth 0 means 1 leaf node; depth 1 means + 1 internal node + 2 leaf nodes. (default: 4) + :param maxBins: maximum number of bins used for splitting + features (default: 100) - :param seed: Random seed for bootstrapping and choosing feature subsets. + :param seed: Random seed for bootstrapping and choosing feature + subsets. :return: RandomForestModel that can be used for prediction Example usage: @@ -337,19 +355,24 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: Number of features to consider for - splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "onethird" for regression. - :param impurity: Criterion used for information gain calculation. - Supported values: "variance". - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 - leaf node; depth 1 means 1 internal node + 2 leaf nodes. - (default: 4) - :param maxBins: maximum number of bins used for splitting features - (default: 100) - :param seed: Random seed for bootstrapping and choosing feature subsets. + splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", + "onethird". 
+ If "auto" is set, this parameter is set based on + numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "onethird" for + regression. + :param impurity: Criterion used for information gain + calculation. + Supported values: "variance". + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means + 1 leaf node; depth 1 means 1 internal node + 2 leaf + nodes. (default: 4) + :param maxBins: maximum number of bins used for splitting + features (default: 100) + :param seed: Random seed for bootstrapping and choosing feature + subsets. :return: RandomForestModel that can be used for prediction Example usage: @@ -395,7 +418,8 @@ class GradientBoostedTrees(object): """ .. note:: Experimental - Learning algorithm for a gradient boosted trees model for classification or regression. + Learning algorithm for a gradient boosted trees model for + classification or regression. """ @classmethod @@ -411,24 +435,29 @@ def _train(cls, data, algo, categoricalFeaturesInfo, def trainClassifier(cls, data, categoricalFeaturesInfo, loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3): """ - Method to train a gradient-boosted trees model for classification. + Method to train a gradient-boosted trees model for + classification. - :param data: Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. + :param data: Training dataset: RDD of LabeledPoint. + Labels should take values {0, 1}. :param categoricalFeaturesInfo: Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. - :param loss: Loss function used for minimization during gradient boosting. - Supported: {"logLoss" (default), "leastSquaresError", "leastAbsoluteError"}. + :param loss: Loss function used for minimization during gradient + boosting. Supported: {"logLoss" (default), + "leastSquaresError", "leastAbsoluteError"}. :param numIterations: Number of iterations of boosting. (default: 100) - :param learningRate: Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1] - (default: 0.1) - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 - leaf node; depth 1 means 1 internal node + 2 leaf nodes. - (default: 3) - :return: GradientBoostedTreesModel that can be used for prediction + :param learningRate: Learning rate for shrinking the + contribution of each estimator. The learning rate + should be between in the interval (0, 1]. + (default: 0.1) + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means + 1 leaf node; depth 1 means 1 internal node + 2 leaf + nodes. (default: 3) + :return: GradientBoostedTreesModel that can be used for + prediction Example usage: @@ -472,17 +501,20 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, features. E.g., an entry (n -> k) indicates that feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. - :param loss: Loss function used for minimization during gradient boosting. - Supported: {"logLoss" (default), "leastSquaresError", "leastAbsoluteError"}. + :param loss: Loss function used for minimization during gradient + boosting. Supported: {"logLoss" (default), + "leastSquaresError", "leastAbsoluteError"}. :param numIterations: Number of iterations of boosting. (default: 100) - :param learningRate: Learning rate for shrinking the contribution of each estimator. 
- The learning rate should be between in the interval (0, 1] - (default: 0.1) - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 - leaf node; depth 1 means 1 internal node + 2 leaf nodes. - (default: 3) - :return: GradientBoostedTreesModel that can be used for prediction + :param learningRate: Learning rate for shrinking the + contribution of each estimator. The learning rate + should be between in the interval (0, 1]. + (default: 0.1) + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means + 1 leaf node; depth 1 means 1 internal node + 2 leaf + nodes. (default: 3) + :return: GradientBoostedTreesModel that can be used for + prediction Example usage: From e43139f40309995b1133c7ef2936ab858b7b44fc Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 25 Feb 2015 23:43:29 -0800 Subject: [PATCH 313/357] [SPARK-5976][MLLIB] Add partitioner to factors returned by ALS The model trained by ALS requires partitioning information to do quick lookup of a user/item factor for making recommendation on individual requests. In the new implementation, we didn't set partitioners in the factors returned by ALS, which would cause performance regression. srowen coderxiang Author: Xiangrui Meng Closes #4748 from mengxr/SPARK-5976 and squashes the following commits: 9373a09 [Xiangrui Meng] add partitioner to factors returned by ALS 260f183 [Xiangrui Meng] add a test for partitioner --- .../apache/spark/ml/recommendation/ALS.scala | 55 +++++++++++-------- .../spark/ml/recommendation/ALSSuite.scala | 32 ++++++++++- 2 files changed, 64 insertions(+), 23 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index c2ec716f08b7c..7bb69df65362b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -29,7 +29,7 @@ import com.github.fommil.netlib.LAPACK.{getInstance => lapack} import org.jblas.DoubleMatrix import org.netlib.util.intW -import org.apache.spark.{HashPartitioner, Logging, Partitioner} +import org.apache.spark.{Logging, Partitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ @@ -501,8 +501,8 @@ object ALS extends Logging { require(intermediateRDDStorageLevel != StorageLevel.NONE, "ALS is not designed to run without persisting intermediate RDDs.") val sc = ratings.sparkContext - val userPart = new HashPartitioner(numUserBlocks) - val itemPart = new HashPartitioner(numItemBlocks) + val userPart = new ALSPartitioner(numUserBlocks) + val itemPart = new ALSPartitioner(numItemBlocks) val userLocalIndexEncoder = new LocalIndexEncoder(userPart.numPartitions) val itemLocalIndexEncoder = new LocalIndexEncoder(itemPart.numPartitions) val solver = if (nonnegative) new NNLSSolver else new CholeskySolver @@ -550,13 +550,23 @@ object ALS extends Logging { val userIdAndFactors = userInBlocks .mapValues(_.srcIds) .join(userFactors) - .values + .mapPartitions({ items => + items.flatMap { case (_, (ids, factors)) => + ids.view.zip(factors) + } + // Preserve the partitioning because IDs are consistent with the partitioners in userInBlocks + // and userFactors. 
+ }, preservesPartitioning = true) .setName("userFactors") .persist(finalRDDStorageLevel) val itemIdAndFactors = itemInBlocks .mapValues(_.srcIds) .join(itemFactors) - .values + .mapPartitions({ items => + items.flatMap { case (_, (ids, factors)) => + ids.view.zip(factors) + } + }, preservesPartitioning = true) .setName("itemFactors") .persist(finalRDDStorageLevel) if (finalRDDStorageLevel != StorageLevel.NONE) { @@ -569,13 +579,7 @@ object ALS extends Logging { itemOutBlocks.unpersist() blockRatings.unpersist() } - val userOutput = userIdAndFactors.flatMap { case (ids, factors) => - ids.view.zip(factors) - } - val itemOutput = itemIdAndFactors.flatMap { case (ids, factors) => - ids.view.zip(factors) - } - (userOutput, itemOutput) + (userIdAndFactors, itemIdAndFactors) } /** @@ -995,15 +999,15 @@ object ALS extends Logging { "Converting to local indices took " + (System.nanoTime() - start) / 1e9 + " seconds.") val dstLocalIndices = dstIds.map(dstIdToLocalIndex.apply) (srcBlockId, (dstBlockId, srcIds, dstLocalIndices, ratings)) - }.groupByKey(new HashPartitioner(srcPart.numPartitions)) - .mapValues { iter => - val builder = - new UncompressedInBlockBuilder[ID](new LocalIndexEncoder(dstPart.numPartitions)) - iter.foreach { case (dstBlockId, srcIds, dstLocalIndices, ratings) => - builder.add(dstBlockId, srcIds, dstLocalIndices, ratings) - } - builder.build().compress() - }.setName(prefix + "InBlocks") + }.groupByKey(new ALSPartitioner(srcPart.numPartitions)) + .mapValues { iter => + val builder = + new UncompressedInBlockBuilder[ID](new LocalIndexEncoder(dstPart.numPartitions)) + iter.foreach { case (dstBlockId, srcIds, dstLocalIndices, ratings) => + builder.add(dstBlockId, srcIds, dstLocalIndices, ratings) + } + builder.build().compress() + }.setName(prefix + "InBlocks") .persist(storageLevel) val outBlocks = inBlocks.mapValues { case InBlock(srcIds, dstPtrs, dstEncodedIndices, _) => val encoder = new LocalIndexEncoder(dstPart.numPartitions) @@ -1064,7 +1068,7 @@ object ALS extends Logging { (dstBlockId, (srcBlockId, activeIndices.map(idx => srcFactors(idx)))) } } - val merged = srcOut.groupByKey(new HashPartitioner(dstInBlocks.partitions.length)) + val merged = srcOut.groupByKey(new ALSPartitioner(dstInBlocks.partitions.length)) dstInBlocks.join(merged).mapValues { case (InBlock(dstIds, srcPtrs, srcEncodedIndices, ratings), srcFactors) => val sortedSrcFactors = new Array[FactorBlock](numSrcBlocks) @@ -1149,4 +1153,11 @@ object ALS extends Logging { encoded & localIndexMask } } + + /** + * Partitioner used by ALS. We requires that getPartition is a projection. That is, for any key k, + * we have getPartition(getPartition(k)) = getPartition(k). Since the the default HashPartitioner + * satisfies this requirement, we simply use a type alias here. 
+ */ + private[recommendation] type ALSPartitioner = org.apache.spark.HashPartitioner } diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala index 376c3626f9bbb..bb86bafc0eb0a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala @@ -25,7 +25,7 @@ import scala.collection.mutable.ArrayBuffer import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.scalatest.FunSuite -import org.apache.spark.Logging +import org.apache.spark.{Logging, SparkException} import org.apache.spark.ml.recommendation.ALS._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext @@ -455,4 +455,34 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging { assert(isNonnegative(itemFactors)) // TODO: Validate the solution. } + + test("als partitioner is a projection") { + for (p <- Seq(1, 10, 100, 1000)) { + val part = new ALSPartitioner(p) + var k = 0 + while (k < p) { + assert(k === part.getPartition(k)) + assert(k === part.getPartition(k.toLong)) + k += 1 + } + } + } + + test("partitioner in returned factors") { + val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01) + val (userFactors, itemFactors) = ALS.train( + ratings, rank = 2, maxIter = 4, numUserBlocks = 3, numItemBlocks = 4) + for ((tpe, factors) <- Seq(("User", userFactors), ("Item", itemFactors))) { + assert(userFactors.partitioner.isDefined, s"$tpe factors should have partitioner.") + val part = userFactors.partitioner.get + userFactors.mapPartitionsWithIndex { (idx, items) => + items.foreach { case (id, _) => + if (part.getPartition(id) != idx) { + throw new SparkException(s"$tpe with ID $id should not be in partition $idx.") + } + } + Iterator.empty + }.count() + } + } } From 51a6f9097bb475cb518ca766a46c7226640cf58e Mon Sep 17 00:00:00 2001 From: Judy Nash Date: Thu, 26 Feb 2015 11:14:37 +0000 Subject: [PATCH 314/357] [SPARK-5914] to run spark-submit requiring only user perm on windows Because windows on-default does not grant read permission to jars except to admin, spark-submit would fail with "ClassNotFound" exception if user runs slave service with only user permission. 
This fix is to add read permission to owner of the jar (which would be the slave service account in windows ) Author: Judy Nash Closes #4742 from judynash/SPARK-5914 and squashes the following commits: e288e56 [Judy Nash] Fix spacing and refactor code 1de3c0e [Judy Nash] [SPARK-5914] Enable spark-submit to run requiring only user permission on windows --- core/src/main/scala/org/apache/spark/util/Utils.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 4803ff9403b1d..1396f167eb8be 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -440,6 +440,12 @@ private[spark] object Utils extends Logging { } // Make the file executable - That's necessary for scripts FileUtil.chmod(targetFile.getAbsolutePath, "a+x") + + // Windows does not grant read permission by default to non-admin users + // Add read permission to owner explicitly + if (isWindows) { + FileUtil.chmod(targetFile.getAbsolutePath, "u+r") + } } /** From f02394d06473889d0d7897c4583239e6e136ff46 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 26 Feb 2015 22:39:49 +0800 Subject: [PATCH 315/357] [SPARK-6023][SQL] ParquetConversions fails to replace the destination MetastoreRelation of an InsertIntoTable node to ParquetRelation2 JIRA: https://issues.apache.org/jira/browse/SPARK-6023 Author: Yin Huai Closes #4782 from yhuai/parquetInsertInto and squashes the following commits: ae7e806 [Yin Huai] Convert MetastoreRelation in InsertIntoTable and InsertIntoHiveTable. ba543cd [Yin Huai] More tests. 50b6d0f [Yin Huai] Update error messages. 346780c [Yin Huai] Failed test. --- .../spark/sql/hive/HiveMetastoreCatalog.scala | 21 +++ .../spark/sql/parquet/parquetSuites.scala | 138 +++++++++++++++++- 2 files changed, 152 insertions(+), 7 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 2cc8d65d3cb79..8af5a4848fd44 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -440,6 +440,17 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val attributedRewrites = relation.output.zip(parquetRelation.output) (relation, parquetRelation, attributedRewrites) + // Write path + case InsertIntoHiveTable(relation: MetastoreRelation, _, _, _) + // Inserting into partitioned table is not supported in Parquet data source (yet). 
+ if !relation.hiveQlTable.isPartitioned && + hive.convertMetastoreParquet && + hive.conf.parquetUseDataSourceApi && + relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet") => + val parquetRelation = convertToParquetRelation(relation) + val attributedRewrites = relation.output.zip(parquetRelation.output) + (relation, parquetRelation, attributedRewrites) + // Read path case p @ PhysicalOperation(_, _, relation: MetastoreRelation) if hive.convertMetastoreParquet && @@ -464,6 +475,16 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with withAlias } + case InsertIntoTable(r: MetastoreRelation, partition, child, overwrite) + if relationMap.contains(r) => { + val parquetRelation = relationMap(r) + InsertIntoTable(parquetRelation, partition, child, overwrite) + } + case InsertIntoHiveTable(r: MetastoreRelation, partition, child, overwrite) + if relationMap.contains(r) => { + val parquetRelation = relationMap(r) + InsertIntoTable(parquetRelation, partition, child, overwrite) + } case other => other.transformExpressions { case a: Attribute if a.resolved => attributedRewrites.getOrElse(a, a) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index 653f4b47367c4..80fd5cda20e20 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -24,11 +24,11 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.{SQLConf, QueryTest} import org.apache.spark.sql.catalyst.expressions.Row -import org.apache.spark.sql.execution.PhysicalRDD -import org.apache.spark.sql.hive.execution.HiveTableScan +import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD} +import org.apache.spark.sql.hive.execution.{InsertIntoHiveTable, HiveTableScan} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ -import org.apache.spark.sql.sources.LogicalRelation +import org.apache.spark.sql.sources.{InsertIntoDataSource, LogicalRelation} // The data where the partitioning key exists only in the directory structure. 
case class ParquetData(intField: Int, stringField: String) @@ -93,6 +93,11 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest { sql(s"ALTER TABLE partitioned_parquet_with_key ADD PARTITION (p=$p)") } + val rdd1 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) + jsonRDD(rdd1).registerTempTable("jt") + val rdd2 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":[$i, null]}""")) + jsonRDD(rdd2).registerTempTable("jt_array") + setConf("spark.sql.hive.convertMetastoreParquet", "true") } @@ -100,6 +105,8 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest { sql("DROP TABLE partitioned_parquet") sql("DROP TABLE partitioned_parquet_with_key") sql("DROP TABLE normal_parquet") + sql("DROP TABLE IF EXISTS jt") + sql("DROP TABLE IF EXISTS jt_array") setConf("spark.sql.hive.convertMetastoreParquet", "false") } @@ -122,9 +129,6 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase { override def beforeAll(): Unit = { super.beforeAll() - val rdd = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) - jsonRDD(rdd).registerTempTable("jt") - sql( """ |create table test_parquet @@ -143,7 +147,6 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase { override def afterAll(): Unit = { super.afterAll() - sql("DROP TABLE IF EXISTS jt") sql("DROP TABLE IF EXISTS test_parquet") setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) @@ -238,6 +241,70 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase { sql("DROP TABLE IF EXISTS test_parquet_ctas") } + + test("MetastoreRelation in InsertIntoTable will be converted") { + sql( + """ + |create table test_insert_parquet + |( + | intField INT + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin) + + val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt") + df.queryExecution.executedPlan match { + case ExecutedCommand( + InsertIntoDataSource( + LogicalRelation(r: ParquetRelation2), query, overwrite)) => // OK + case o => fail("test_insert_parquet should be converted to a " + + s"${classOf[ParquetRelation2].getCanonicalName} and " + + s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan." 
+ + s"However, found a ${o.toString} ") + } + + checkAnswer( + sql("SELECT intField FROM test_insert_parquet WHERE test_insert_parquet.intField > 5"), + sql("SELECT a FROM jt WHERE jt.a > 5").collect() + ) + + sql("DROP TABLE IF EXISTS test_insert_parquet") + } + + test("MetastoreRelation in InsertIntoHiveTable will be converted") { + sql( + """ + |create table test_insert_parquet + |( + | int_array array + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin) + + val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt_array") + df.queryExecution.executedPlan match { + case ExecutedCommand( + InsertIntoDataSource( + LogicalRelation(r: ParquetRelation2), query, overwrite)) => // OK + case o => fail("test_insert_parquet should be converted to a " + + s"${classOf[ParquetRelation2].getCanonicalName} and " + + s"${classOf[InsertIntoDataSource].getCanonicalName} is expcted as the SparkPlan." + + s"However, found a ${o.toString} ") + } + + checkAnswer( + sql("SELECT int_array FROM test_insert_parquet"), + sql("SELECT a FROM jt_array").collect() + ) + + sql("DROP TABLE IF EXISTS test_insert_parquet") + } } class ParquetDataSourceOffMetastoreSuite extends ParquetMetastoreSuiteBase { @@ -252,6 +319,63 @@ class ParquetDataSourceOffMetastoreSuite extends ParquetMetastoreSuiteBase { super.afterAll() setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString) } + + test("MetastoreRelation in InsertIntoTable will not be converted") { + sql( + """ + |create table test_insert_parquet + |( + | intField INT + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin) + + val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt") + df.queryExecution.executedPlan match { + case insert: InsertIntoHiveTable => // OK + case o => fail(s"The SparkPlan should be ${classOf[InsertIntoHiveTable].getCanonicalName}. " + + s"However, found ${o.toString}.") + } + + checkAnswer( + sql("SELECT intField FROM test_insert_parquet WHERE test_insert_parquet.intField > 5"), + sql("SELECT a FROM jt WHERE jt.a > 5").collect() + ) + + sql("DROP TABLE IF EXISTS test_insert_parquet") + } + + // TODO: enable it after the fix of SPARK-5950. + ignore("MetastoreRelation in InsertIntoHiveTable will not be converted") { + sql( + """ + |create table test_insert_parquet + |( + | int_array array + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin) + + val df = sql("INSERT INTO TABLE test_insert_parquet SELECT a FROM jt_array") + df.queryExecution.executedPlan match { + case insert: InsertIntoHiveTable => // OK + case o => fail(s"The SparkPlan should be ${classOf[InsertIntoHiveTable].getCanonicalName}. 
" + + s"However, found ${o.toString}.") + } + + checkAnswer( + sql("SELECT int_array FROM test_insert_parquet"), + sql("SELECT a FROM jt_array").collect() + ) + + sql("DROP TABLE IF EXISTS test_insert_parquet") + } } /** From 192e42a2933eb283e12bfdfb46e2ef895228af4a Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Fri, 27 Feb 2015 01:01:32 +0800 Subject: [PATCH 316/357] [SPARK-6016][SQL] Cannot read the parquet table after overwriting the existing table when spark.sql.parquet.cacheMetadata=true Please see JIRA (https://issues.apache.org/jira/browse/SPARK-6016) for details of the bug. Author: Yin Huai Closes #4775 from yhuai/parquetFooterCache and squashes the following commits: 78787b1 [Yin Huai] Remove footerCache in FilteringParquetRowInputFormat. dff6fba [Yin Huai] Failed unit test. --- .../sql/parquet/ParquetTableOperations.scala | 49 +++---------------- .../apache/spark/sql/parquet/newParquet.scala | 8 ++- .../spark/sql/parquet/parquetSuites.scala | 27 ++++++++++ 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 4dc13b036cd4e..9061d3f5fee4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -374,8 +374,6 @@ private[parquet] class AppendingParquetOutputFormat(offset: Int) private[parquet] class FilteringParquetRowInputFormat extends parquet.hadoop.ParquetInputFormat[Row] with Logging { - private var footers: JList[Footer] = _ - private var fileStatuses = Map.empty[Path, FileStatus] override def createRecordReader( @@ -396,46 +394,15 @@ private[parquet] class FilteringParquetRowInputFormat } } - override def getFooters(jobContext: JobContext): JList[Footer] = { - import org.apache.spark.sql.parquet.FilteringParquetRowInputFormat.footerCache - - if (footers eq null) { - val conf = ContextUtil.getConfiguration(jobContext) - val cacheMetadata = conf.getBoolean(SQLConf.PARQUET_CACHE_METADATA, true) - val statuses = listStatus(jobContext) - fileStatuses = statuses.map(file => file.getPath -> file).toMap - if (statuses.isEmpty) { - footers = Collections.emptyList[Footer] - } else if (!cacheMetadata) { - // Read the footers from HDFS - footers = getFooters(conf, statuses) - } else { - // Read only the footers that are not in the footerCache - val foundFooters = footerCache.getAllPresent(statuses) - val toFetch = new ArrayList[FileStatus] - for (s <- statuses) { - if (!foundFooters.containsKey(s)) { - toFetch.add(s) - } - } - val newFooters = new mutable.HashMap[FileStatus, Footer] - if (toFetch.size > 0) { - val startFetch = System.currentTimeMillis - val fetched = getFooters(conf, toFetch) - logInfo(s"Fetched $toFetch footers in ${System.currentTimeMillis - startFetch} ms") - for ((status, i) <- toFetch.zipWithIndex) { - newFooters(status) = fetched.get(i) - } - footerCache.putAll(newFooters) - } - footers = new ArrayList[Footer](statuses.size) - for (status <- statuses) { - footers.add(newFooters.getOrElse(status, foundFooters.get(status))) - } - } - } + // This is only a temporary solution sicne we need to use fileStatuses in + // both getClientSideSplits and getTaskSideSplits. It can be removed once we get rid of these + // two methods. + override def getSplits(jobContext: JobContext): JList[InputSplit] = { + // First set fileStatuses. 
+ val statuses = listStatus(jobContext) + fileStatuses = statuses.map(file => file.getPath -> file).toMap - footers + super.getSplits(jobContext) } // TODO Remove this method and related code once PARQUET-16 is fixed diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 16b771344bfcd..e648618468d5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -200,7 +200,7 @@ private[sql] case class ParquetRelation2( private var commonMetadataStatuses: Array[FileStatus] = _ // Parquet footer cache. - private var footers: Map[FileStatus, Footer] = _ + var footers: Map[FileStatus, Footer] = _ // `FileStatus` objects of all data files (Parquet part-files). var dataStatuses: Array[FileStatus] = _ @@ -400,6 +400,7 @@ private[sql] case class ParquetRelation2( } else { metadataCache.dataStatuses.toSeq } + val selectedFooters = selectedFiles.map(metadataCache.footers) // FileInputFormat cannot handle empty lists. if (selectedFiles.nonEmpty) { @@ -447,11 +448,16 @@ private[sql] case class ParquetRelation2( @transient val cachedStatus = selectedFiles + @transient + val cachedFooters = selectedFooters + // Overridden so we can inject our own cached files statuses. override def getPartitions: Array[SparkPartition] = { val inputFormat = if (cacheMetadata) { new FilteringParquetRowInputFormat { override def listStatus(jobContext: JobContext): JList[FileStatus] = cachedStatus + + override def getFooters(jobContext: JobContext): JList[Footer] = cachedFooters } } else { new FilteringParquetRowInputFormat diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index 80fd5cda20e20..6a9d9daf6750c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.hive.execution.{InsertIntoHiveTable, HiveTableScan} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ import org.apache.spark.sql.sources.{InsertIntoDataSource, LogicalRelation} +import org.apache.spark.sql.SaveMode // The data where the partitioning key exists only in the directory structure. case class ParquetData(intField: Int, stringField: String) @@ -409,6 +410,32 @@ class ParquetSourceSuiteBase extends ParquetPartitioningTest { ) """) } + + test("SPARK-6016 make sure to use the latest footers") { + sql("drop table if exists spark_6016_fix") + + // Create a DataFrame with two partitions. So, the created table will have two parquet files. + val df1 = jsonRDD(sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i}"""), 2)) + df1.saveAsTable("spark_6016_fix", "parquet", SaveMode.Overwrite) + checkAnswer( + sql("select * from spark_6016_fix"), + (1 to 10).map(i => Row(i)) + ) + + // Create a DataFrame with four partitions. So, the created table will have four parquet files. + val df2 = jsonRDD(sparkContext.parallelize((1 to 10).map(i => s"""{"b":$i}"""), 4)) + df2.saveAsTable("spark_6016_fix", "parquet", SaveMode.Overwrite) + // For the bug of SPARK-6016, we are caching two outdated footers for df1. 
Then, + // since the new table has four parquet files, we are trying to read new footers from two files + // and then merge metadata in footers of these four (two outdated ones and two latest one), + // which will cause an error. + checkAnswer( + sql("select * from spark_6016_fix"), + (1 to 10).map(i => Row(i)) + ) + + sql("drop table spark_6016_fix") + } } class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase { From df3d559b32f1ceb8ca3491e2a1169c56a6faab58 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 26 Feb 2015 17:35:03 +0000 Subject: [PATCH 317/357] [SPARK-5801] [core] Avoid creating nested directories. Cache the value of the local root dirs to use for storing local data, so that the same directories are reused. Also, to avoid an extra level of nesting, use a different env variable to propagate the local dirs from the Worker to the executors. And make the executor directory use a different name. Author: Marcelo Vanzin Closes #4747 from vanzin/SPARK-5801 and squashes the following commits: e0114e1 [Marcelo Vanzin] Update unit test. 18ee0a7 [Marcelo Vanzin] [SPARK-5801] [core] Avoid creating nested directories. --- .../spark/deploy/worker/ExecutorRunner.scala | 2 +- .../apache/spark/deploy/worker/Worker.scala | 4 ++-- .../scala/org/apache/spark/util/Utils.scala | 23 +++++++++++++++++++ .../apache/spark/storage/LocalDirsSuite.scala | 8 +++++-- 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala index bea04cd542fd1..6653aca0a0f06 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala @@ -135,7 +135,7 @@ private[spark] class ExecutorRunner( logInfo("Launch command: " + command.mkString("\"", "\" \"", "\"")) builder.directory(executorDir) - builder.environment.put("SPARK_LOCAL_DIRS", appLocalDirs.mkString(",")) + builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator)) // In case we are running this from within the Spark Shell, avoid creating a "scala" // parent process for the executor command builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0") diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 10929eb516041..2473a90aa9309 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -345,11 +345,11 @@ private[spark] class Worker( } // Create local dirs for the executor. These are passed to the executor via the - // SPARK_LOCAL_DIRS environment variable, and deleted by the Worker when the + // SPARK_EXECUTOR_DIRS environment variable, and deleted by the Worker when the // application finishes. 
val appLocalDirs = appDirectories.get(appId).getOrElse { Utils.getOrCreateLocalRootDirs(conf).map { dir => - Utils.createDirectory(dir).getAbsolutePath() + Utils.createDirectory(dir, namePrefix = "executor").getAbsolutePath() }.toSeq } appDirectories(appId) = appLocalDirs diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 1396f167eb8be..4644088f19f4b 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -63,6 +63,7 @@ private[spark] object Utils extends Logging { val random = new Random() private val MAX_DIR_CREATION_ATTEMPTS: Int = 10 + @volatile private var localRootDirs: Array[String] = null /** Serialize an object using Java serialization */ def serialize[T](o: T): Array[Byte] = { @@ -683,14 +684,31 @@ private[spark] object Utils extends Logging { * and returns only the directories that exist / could be created. * * If no directories could be created, this will return an empty list. + * + * This method will cache the local directories for the application when it's first invoked. + * So calling it multiple times with a different configuration will always return the same + * set of directories. */ private[spark] def getOrCreateLocalRootDirs(conf: SparkConf): Array[String] = { + if (localRootDirs == null) { + this.synchronized { + if (localRootDirs == null) { + localRootDirs = getOrCreateLocalRootDirsImpl(conf) + } + } + } + localRootDirs + } + + private def getOrCreateLocalRootDirsImpl(conf: SparkConf): Array[String] = { if (isRunningInYarnContainer(conf)) { // If we are in yarn mode, systems can have different disk layouts so we must set it // to what Yarn on this system said was available. Note this assumes that Yarn has // created the directories already, and that they are secured so that only the // user has access to them. getYarnLocalDirs(conf).split(",") + } else if (conf.getenv("SPARK_EXECUTOR_DIRS") != null) { + conf.getenv("SPARK_EXECUTOR_DIRS").split(File.pathSeparator) } else { // In non-Yarn mode (or for the driver in yarn-client mode), we cannot trust the user // configuration to point to a secure directory. So create a subdirectory with restricted @@ -734,6 +752,11 @@ private[spark] object Utils extends Logging { localDirs } + /** Used by unit tests. Do not call from other places. */ + private[spark] def clearLocalRootDirs(): Unit = { + localRootDirs = null + } + /** * Shuffle the elements of a collection into a random order, returning the * result in a new collection. Unlike scala.util.Random.shuffle, this method diff --git a/core/src/test/scala/org/apache/spark/storage/LocalDirsSuite.scala b/core/src/test/scala/org/apache/spark/storage/LocalDirsSuite.scala index 8cf951adb354b..82a82e23eecf2 100644 --- a/core/src/test/scala/org/apache/spark/storage/LocalDirsSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/LocalDirsSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.storage import java.io.File import org.apache.spark.util.Utils -import org.scalatest.FunSuite +import org.scalatest.{BeforeAndAfter, FunSuite} import org.apache.spark.SparkConf @@ -28,7 +28,11 @@ import org.apache.spark.SparkConf /** * Tests for the spark.local.dir and SPARK_LOCAL_DIRS configuration options. 
*/ -class LocalDirsSuite extends FunSuite { +class LocalDirsSuite extends FunSuite with BeforeAndAfter { + + before { + Utils.clearLocalRootDirs() + } test("Utils.getLocalDir() returns a valid directory, even if some local dirs are missing") { // Regression test for SPARK-2974 From 2358657547016d647cdd2e2d363426fcd8d3e9ff Mon Sep 17 00:00:00 2001 From: Jacky Li Date: Thu, 26 Feb 2015 10:40:58 -0800 Subject: [PATCH 318/357] [SPARK-6007][SQL] Add numRows param in DataFrame.show() It is useful to let the user decide the number of rows to show in DataFrame.show Author: Jacky Li Closes #4767 from jackylk/show and squashes the following commits: a0e0f4b [Jacky Li] fix testcase 7cdbe91 [Jacky Li] modify according to comment bb54537 [Jacky Li] for Java compatibility d7acc18 [Jacky Li] modify according to comments 981be52 [Jacky Li] add numRows param in DataFrame.show() --- python/pyspark/sql/dataframe.py | 6 +++--- .../main/scala/org/apache/spark/sql/DataFrame.scala | 13 ++++++++++--- .../org/apache/spark/sql/JavaDataFrameSuite.java | 9 +++++++++ .../scala/org/apache/spark/sql/DataFrameSuite.scala | 5 +++++ 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 6d42410020b64..aec99017fbdc1 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -272,9 +272,9 @@ def isLocal(self): """ return self._jdf.isLocal() - def show(self): + def show(self, n=20): """ - Print the first 20 rows. + Print the first n rows. >>> df DataFrame[age: int, name: string] @@ -283,7 +283,7 @@ def show(self): 2 Alice 5 Bob """ - print self._jdf.showString().encode('utf8', 'ignore') + print self._jdf.showString(n).encode('utf8', 'ignore') def __repr__(self): return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index f045da305ca5d..060ab5e9a0cfa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -159,9 +159,10 @@ class DataFrame protected[sql]( /** * Internal API for Python + * @param numRows Number of rows to show */ - private[sql] def showString(): String = { - val data = take(20) + private[sql] def showString(numRows: Int): String = { + val data = take(numRows) val numCols = schema.fieldNames.length // For cells that are beyond 20 characters, replace it with the first 17 and "..." @@ -293,9 +294,15 @@ class DataFrame protected[sql]( * 1983 03 0.410516 0.442194 * 1984 04 0.450090 0.483521 * }}} + * @param numRows Number of rows to show * @group basic */ - def show(): Unit = println(showString()) + def show(numRows: Int): Unit = println(showString(numRows)) + + /** + * Displays the top 20 rows of [[DataFrame]] in a tabular form. + */ + def show(): Unit = show(20) /** * Cartesian join with another [[DataFrame]]. 
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index c1c51f80d6586..2d586f784ac5a 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -20,6 +20,7 @@ import org.junit.After; import org.junit.Assert; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import org.apache.spark.sql.*; @@ -81,4 +82,12 @@ public void testVarargMethods() { df.groupBy().agg(countDistinct(col("key"), col("value"))); df.select(coalesce(col("key"))); } + + @Ignore + public void testShow() { + // This test case is intended ignored, but to make sure it compiles correctly + DataFrame df = context.table("testData"); + df.show(); + df.show(1000); + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index c392a553c03f3..ff441ef26f9c0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -441,4 +441,9 @@ class DataFrameSuite extends QueryTest { checkAnswer(df.select(df("key")), testData.select('key).collect().toSeq) } + ignore("show") { + // This test case is intended ignored, but to make sure it compiles correctly + testData.select($"*").show() + testData.select($"*").show(1000) + } } From cfff397f0adb27ca102cca43a7696e9fb1819ee0 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 26 Feb 2015 10:51:47 -0800 Subject: [PATCH 319/357] [SPARK-6004][MLlib] Pick the best model when training GradientBoostedTrees with validation Since the validation error does not change monotonically, in practice, it should be proper to pick the best model when training GradientBoostedTrees with validation instead of stopping it early. Author: Liang-Chi Hsieh Closes #4763 from viirya/gbt_record_model and squashes the following commits: 452e049 [Liang-Chi Hsieh] Address comment. ea2fae2 [Liang-Chi Hsieh] Pick the best model when training GradientBoostedTrees with validation. 
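As a minimal sketch, with made-up validation errors and an illustrative object name: once the error after each boosting iteration is tracked, picking the best model amounts to keeping only the prefix of trees that minimizes that error, rather than returning everything trained so far.

object BestModelSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical per-iteration validation errors; the minimum is at iteration 3.
    val validationErrors = Array(0.42, 0.31, 0.28, 0.30, 0.33)
    // bestM is the 1-based number of trees to keep.
    val bestM = validationErrors.zipWithIndex.minBy(_._1)._2 + 1
    // The patch below does the equivalent with baseLearners.slice(0, bestM)
    // and baseLearnerWeights.slice(0, bestM).
    println(s"Best model uses the first $bestM trees") // prints 3 here
  }
}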
--- .../spark/mllib/tree/GradientBoostedTrees.scala | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index b4466ff40937f..a9c93e181e3ce 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -251,9 +251,15 @@ object GradientBoostedTrees extends Logging { logInfo("Internal timing for DecisionTree:") logInfo(s"$timer") - - new GradientBoostedTreesModel( - boostingStrategy.treeStrategy.algo, baseLearners, baseLearnerWeights) + if (validate) { + new GradientBoostedTreesModel( + boostingStrategy.treeStrategy.algo, + baseLearners.slice(0, bestM), + baseLearnerWeights.slice(0, bestM)) + } else { + new GradientBoostedTreesModel( + boostingStrategy.treeStrategy.algo, baseLearners, baseLearnerWeights) + } } } From 7fa960e653a905fc48d4097b49ce560cff919fa2 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 26 Feb 2015 11:54:17 -0800 Subject: [PATCH 320/357] [SPARK-5363] Fix bug in PythonRDD: remove() inside iterator is not safe Removing elements from a mutable HashSet while iterating over it can cause the iteration to incorrectly skip over entries that were not removed. If this happened, PythonRDD would write fewer broadcast variables than the Python worker was expecting to read, which would cause the Python worker to hang indefinitely. Author: Davies Liu Closes #4776 from davies/fix_hang and squashes the following commits: a4384a5 [Davies Liu] fix bug: remvoe() inside iterator is not safe --- .../org/apache/spark/api/python/PythonRDD.scala | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index dcb6e6313a1d2..b1cec0f6472b0 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -219,14 +219,13 @@ private[spark] class PythonRDD( val oldBids = PythonRDD.getWorkerBroadcasts(worker) val newBids = broadcastVars.map(_.id).toSet // number of different broadcasts - val cnt = oldBids.diff(newBids).size + newBids.diff(oldBids).size + val toRemove = oldBids.diff(newBids) + val cnt = toRemove.size + newBids.diff(oldBids).size dataOut.writeInt(cnt) - for (bid <- oldBids) { - if (!newBids.contains(bid)) { - // remove the broadcast from worker - dataOut.writeLong(- bid - 1) // bid >= 0 - oldBids.remove(bid) - } + for (bid <- toRemove) { + // remove the broadcast from worker + dataOut.writeLong(- bid - 1) // bid >= 0 + oldBids.remove(bid) } for (broadcast <- broadcastVars) { if (!oldBids.contains(broadcast.id)) { From cd5c8d7bbd3ea410df08af6cdd3833e0ed4b91a0 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Thu, 26 Feb 2015 12:56:54 -0800 Subject: [PATCH 321/357] SPARK-4704 [CORE] SparkSubmitDriverBootstrap doesn't flush output Join on output threads to make sure any lingering output from process reaches stdout, stderr before exiting CC andrewor14 since I believe he created this section of code Author: Sean Owen Closes #4788 from srowen/SPARK-4704 and squashes the following commits: ad7114e [Sean Owen] Join on output threads to make sure any lingering output from process reaches stdout, stderr before exiting --- .../apache/spark/deploy/SparkSubmitDriverBootstrapper.scala 
| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala index 2eab9981845e8..311048cdaa324 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala @@ -17,8 +17,6 @@ package org.apache.spark.deploy -import java.io.File - import scala.collection.JavaConversions._ import org.apache.spark.util.{RedirectThread, Utils} @@ -164,6 +162,8 @@ private[spark] object SparkSubmitDriverBootstrapper { } } val returnCode = process.waitFor() + stdoutThread.join() + stderrThread.join() sys.exit(returnCode) } From 10094a523e3993b775111ae9b22ca31cc0d76e03 Mon Sep 17 00:00:00 2001 From: Li Zhihui Date: Thu, 26 Feb 2015 13:07:07 -0800 Subject: [PATCH 322/357] Modify default value description for spark.scheduler.minRegisteredResourcesRatio on docs. The configuration is not supported in mesos mode now. See https://github.com/apache/spark/pull/1462 Author: Li Zhihui Closes #4781 from li-zhihui/fixdocconf and squashes the following commits: 63e7a44 [Li Zhihui] Modify default value description for spark.scheduler.minRegisteredResourcesRatio on docs. --- docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index 8dd2bad61344f..c11787b17eb8c 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1018,7 +1018,7 @@ Apart from these, the following properties are also available, and may be useful spark.scheduler.minRegisteredResourcesRatio - 0.0 for Mesos and Standalone mode, 0.8 for YARN + 0.8 for YARN mode; 0.0 otherwise The minimum ratio of registered resources (registered resources / total expected resources) (resources are executors in yarn mode, CPU cores in standalone mode) From 8942b522d8a3269a2a357e3a274ed4b3e66ebdde Mon Sep 17 00:00:00 2001 From: xukun 00228947 Date: Thu, 26 Feb 2015 13:24:00 -0800 Subject: [PATCH 323/357] [SPARK-3562]Periodic cleanup event logs Author: xukun 00228947 Closes #4214 from viper-kun/cleaneventlog and squashes the following commits: 7a5b9c5 [xukun 00228947] fix issue 31674ee [xukun 00228947] fix issue 6e3d06b [xukun 00228947] fix issue 373f3b9 [xukun 00228947] fix issue 71782b5 [xukun 00228947] fix issue 5b45035 [xukun 00228947] fix issue 70c28d6 [xukun 00228947] fix issues adcfe86 [xukun 00228947] Periodic cleanup event logs --- .../scala/org/apache/spark/SparkConf.scala | 8 +- .../deploy/history/FsHistoryProvider.scala | 112 ++++++++++++------ docs/monitoring.md | 25 +++- 3 files changed, 110 insertions(+), 35 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 0dbd26146cb13..0f4922ab4e310 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -362,7 +362,13 @@ private[spark] object SparkConf extends Logging { DeprecatedConfig("spark.files.userClassPathFirst", "spark.executor.userClassPathFirst", "1.3"), DeprecatedConfig("spark.yarn.user.classpath.first", null, "1.3", - "Use spark.{driver,executor}.userClassPathFirst instead.")) + "Use spark.{driver,executor}.userClassPathFirst instead."), + DeprecatedConfig("spark.history.fs.updateInterval", + "spark.history.fs.update.interval.seconds", + "1.3", "Use 
spark.history.fs.update.interval.seconds instead"), + DeprecatedConfig("spark.history.updateInterval", + "spark.history.fs.update.interval.seconds", + "1.3", "Use spark.history.fs.update.interval.seconds instead")) configs.map { x => (x.oldName, x) }.toMap } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 885fa0fdbf85b..1aaa7b72735ab 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -17,9 +17,13 @@ package org.apache.spark.deploy.history -import java.io.{BufferedInputStream, FileNotFoundException, InputStream} +import java.io.{IOException, BufferedInputStream, FileNotFoundException, InputStream} +import java.util.concurrent.{Executors, TimeUnit} import scala.collection.mutable +import scala.concurrent.duration.Duration + +import com.google.common.util.concurrent.ThreadFactoryBuilder import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.fs.permission.AccessControlException @@ -44,8 +48,15 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis private val NOT_STARTED = "" // Interval between each check for event log updates - private val UPDATE_INTERVAL_MS = conf.getInt("spark.history.fs.updateInterval", - conf.getInt("spark.history.updateInterval", 10)) * 1000 + private val UPDATE_INTERVAL_MS = conf.getOption("spark.history.fs.update.interval.seconds") + .orElse(conf.getOption(SparkConf.translateConfKey("spark.history.fs.updateInterval", true))) + .orElse(conf.getOption(SparkConf.translateConfKey("spark.history.updateInterval", true))) + .map(_.toInt) + .getOrElse(10) * 1000 + + // Interval between each cleaner checks for event logs to delete + private val CLEAN_INTERVAL_MS = conf.getLong("spark.history.fs.cleaner.interval.seconds", + DEFAULT_SPARK_HISTORY_FS_CLEANER_INTERVAL_S) * 1000 private val logDir = conf.getOption("spark.history.fs.logDirectory") .map { d => Utils.resolveURI(d).toString } @@ -53,8 +64,11 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis private val fs = Utils.getHadoopFileSystem(logDir, SparkHadoopUtil.get.newConfiguration(conf)) - // A timestamp of when the disk was last accessed to check for log updates - private var lastLogCheckTimeMs = -1L + // Used by check event thread and clean log thread. + // Scheduled thread pool size must be one, otherwise it will have concurrent issues about fs + // and applications between check task and clean task. + private val pool = Executors.newScheduledThreadPool(1, new ThreadFactoryBuilder() + .setNameFormat("spark-history-task-%d").setDaemon(true).build()) // The modification time of the newest log detected during the last scan. This is used // to ignore logs that are older during subsequent scans, to avoid processing data that @@ -73,25 +87,13 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis private[history] val APPLICATION_COMPLETE = "APPLICATION_COMPLETE" /** - * A background thread that periodically checks for event log updates on disk. - * - * If a log check is invoked manually in the middle of a period, this thread re-adjusts the - * time at which it performs the next log check to maintain the same period as before. - * - * TODO: Add a mechanism to update manually. + * Return a runnable that performs the given operation on the event logs. 
+ * This operation is expected to be executed periodically. */ - private val logCheckingThread = new Thread("LogCheckingThread") { - override def run() = Utils.logUncaughtExceptions { - while (true) { - val now = getMonotonicTimeMs() - if (now - lastLogCheckTimeMs > UPDATE_INTERVAL_MS) { - Thread.sleep(UPDATE_INTERVAL_MS) - } else { - // If the user has manually checked for logs recently, wait until - // UPDATE_INTERVAL_MS after the last check time - Thread.sleep(lastLogCheckTimeMs + UPDATE_INTERVAL_MS - now) - } - checkForLogs() + private def getRunner(operateFun: () => Unit): Runnable = { + new Runnable() { + override def run() = Utils.logUncaughtExceptions { + operateFun() } } } @@ -113,12 +115,17 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis "Logging directory specified is not a directory: %s".format(logDir)) } - checkForLogs() - // Disable the background thread during tests. if (!conf.contains("spark.testing")) { - logCheckingThread.setDaemon(true) - logCheckingThread.start() + // A task that periodically checks for event log updates on disk. + pool.scheduleAtFixedRate(getRunner(checkForLogs), 0, UPDATE_INTERVAL_MS, + TimeUnit.MILLISECONDS) + + if (conf.getBoolean("spark.history.fs.cleaner.enabled", false)) { + // A task that periodically cleans event logs on disk. + pool.scheduleAtFixedRate(getRunner(cleanLogs), 0, CLEAN_INTERVAL_MS, + TimeUnit.MILLISECONDS) + } } } @@ -163,9 +170,6 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis * applications that haven't been updated since last time the logs were checked. */ private[history] def checkForLogs(): Unit = { - lastLogCheckTimeMs = getMonotonicTimeMs() - logDebug("Checking for logs. Time is now %d.".format(lastLogCheckTimeMs)) - try { var newLastModifiedTime = lastModifiedTime val statusList = Option(fs.listStatus(new Path(logDir))).map(_.toSeq) @@ -230,6 +234,45 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis } } + /** + * Delete event logs from the log directory according to the clean policy defined by the user. + */ + private def cleanLogs(): Unit = { + try { + val statusList = Option(fs.listStatus(new Path(logDir))).map(_.toSeq) + .getOrElse(Seq[FileStatus]()) + val maxAge = conf.getLong("spark.history.fs.cleaner.maxAge.seconds", + DEFAULT_SPARK_HISTORY_FS_MAXAGE_S) * 1000 + + val now = System.currentTimeMillis() + val appsToRetain = new mutable.LinkedHashMap[String, FsApplicationHistoryInfo]() + + applications.values.foreach { info => + if (now - info.lastUpdated <= maxAge) { + appsToRetain += (info.id -> info) + } + } + + applications = appsToRetain + + // Scan all logs from the log directory. + // Only directories older than the specified max age will be deleted + statusList.foreach { dir => + try { + if (now - dir.getModificationTime() > maxAge) { + // if path is a directory and set to true, + // the directory is deleted else throws an exception + fs.delete(dir.getPath, true) + } + } catch { + case t: IOException => logError(s"IOException in cleaning logs of $dir", t) + } + } + } catch { + case t: Exception => logError("Exception in cleaning logs", t) + } + } + /** * Comparison function that defines the sort order for the application listing. * @@ -336,9 +379,6 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis } } - /** Returns the system's mononotically increasing time. 
*/ - private def getMonotonicTimeMs(): Long = System.nanoTime() / (1000 * 1000) - /** * Return true when the application has completed. */ @@ -354,6 +394,12 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis private object FsHistoryProvider { val DEFAULT_LOG_DIR = "file:/tmp/spark-events" + + // One day + val DEFAULT_SPARK_HISTORY_FS_CLEANER_INTERVAL_S = Duration(1, TimeUnit.DAYS).toSeconds + + // One week + val DEFAULT_SPARK_HISTORY_FS_MAXAGE_S = Duration(7, TimeUnit.DAYS).toSeconds } private class FsApplicationHistoryInfo( diff --git a/docs/monitoring.md b/docs/monitoring.md index 009a344dff4bb..37ede476c187d 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -86,7 +86,7 @@ follows: - spark.history.fs.updateInterval + spark.history.fs.update.interval.seconds 10 The period, in seconds, at which information displayed by this history server is updated. @@ -145,6 +145,29 @@ follows: If disabled, no access control checks are made. + + spark.history.fs.cleaner.enabled + false + + Specifies whether the History Server should periodically clean up event logs from storage. + + + + spark.history.fs.cleaner.interval.seconds + 86400 + + How often the job history cleaner checks for files to delete, in seconds. Defaults to 86400 (one day). + Files are only deleted if they are older than spark.history.fs.cleaner.maxAge.seconds. + + + + spark.history.fs.cleaner.maxAge.seconds + 3600 * 24 * 7 + + Job history files older than this many seconds will be deleted when the history cleaner runs. + Defaults to 3600 * 24 * 7 (1 week). + + Note that in all of these UIs, the tables are sortable by clicking their headers, From aa63f633d39efa8c29095295f161eaad5495071d Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Thu, 26 Feb 2015 13:46:07 -0800 Subject: [PATCH 324/357] [SPARK-6027][SPARK-5546] Fixed --jar and --packages not working for KafkaUtils and improved error message The problem with SPARK-6027 in short is that JARs like the kafka-assembly.jar does not work in python as the added JAR is not visible in the classloader used by Py4J. Py4J uses Class.forName(), which does not uses the systemclassloader, but the JARs are only visible in the Thread's contextclassloader. So this back uses the context class loader to create the KafkaUtils dstream object. This works for both cases where the Kafka libraries are added with --jars spark-streaming-kafka-assembly.jar or with --packages spark-streaming-kafka Also improves the error message. 
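A minimal sketch of the classloader point above, assuming the Kafka assembly jar has been added with --jars or --packages: the helper class is visible through the thread's context classloader, which is why it is loaded that way instead of through Class.forName().

object ContextClassLoaderSketch {
  def main(args: Array[String]): Unit = {
    val name = "org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper"
    // Resolve the class through the context classloader, which can see jars
    // added at submit time.
    val helperClass = Thread.currentThread().getContextClassLoader.loadClass(name)
    val helper = helperClass.newInstance() // the helper has a zero-arg constructor
    println(s"Loaded ${helperClass.getName}: $helper")
  }
}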
davies Author: Tathagata Das Closes #4779 from tdas/kafka-python-fix and squashes the following commits: fb16b04 [Tathagata Das] Removed import c1fdf35 [Tathagata Das] Fixed long line and improved documentation 7b88be8 [Tathagata Das] Fixed --jar not working for KafkaUtils and improved error message --- .../spark/streaming/kafka/KafkaUtils.scala | 29 ++++++++++++- python/pyspark/streaming/kafka.py | 42 ++++++++++++------- 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index af04bc6576148..62a659518943d 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -27,7 +27,7 @@ import scala.collection.JavaConversions._ import kafka.common.TopicAndPartition import kafka.message.MessageAndMetadata -import kafka.serializer.{Decoder, StringDecoder} +import kafka.serializer.{DefaultDecoder, Decoder, StringDecoder} import org.apache.spark.api.java.function.{Function => JFunction} import org.apache.spark.{SparkContext, SparkException} @@ -532,3 +532,30 @@ object KafkaUtils { ) } } + +/** + * This is a helper class that wraps the KafkaUtils.createStream() into more + * Python-friendly class and function so that it can be easily + * instantiated and called from Python's KafkaUtils (see SPARK-6027). + * + * The zero-arg constructor helps instantiate this class from the Class object + * classOf[KafkaUtilsPythonHelper].newInstance(), and the createStream() + * takes care of known parameters instead of passing them from Python + */ +private class KafkaUtilsPythonHelper { + def createStream( + jssc: JavaStreamingContext, + kafkaParams: JMap[String, String], + topics: JMap[String, JInt], + storageLevel: StorageLevel): JavaPairReceiverInputDStream[Array[Byte], Array[Byte]] = { + KafkaUtils.createStream[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder]( + jssc, + classOf[Array[Byte]], + classOf[Array[Byte]], + classOf[DefaultDecoder], + classOf[DefaultDecoder], + kafkaParams, + topics, + storageLevel) + } +} diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index 19ad71f99d4d5..0002dc10e8a17 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -16,7 +16,7 @@ # from py4j.java_collections import MapConverter -from py4j.java_gateway import java_import, Py4JError +from py4j.java_gateway import java_import, Py4JError, Py4JJavaError from pyspark.storagelevel import StorageLevel from pyspark.serializers import PairDeserializer, NoOpSerializer @@ -50,8 +50,6 @@ def createStream(ssc, zkQuorum, groupId, topics, kafkaParams={}, :param valueDecoder: A function used to decode value (default is utf8_decoder) :return: A DStream object """ - java_import(ssc._jvm, "org.apache.spark.streaming.kafka.KafkaUtils") - kafkaParams.update({ "zookeeper.connect": zkQuorum, "group.id": groupId, @@ -63,20 +61,34 @@ def createStream(ssc, zkQuorum, groupId, topics, kafkaParams={}, jparam = MapConverter().convert(kafkaParams, ssc.sparkContext._gateway._gateway_client) jlevel = ssc._sc._getJavaStorageLevel(storageLevel) - def getClassByName(name): - return ssc._jvm.org.apache.spark.util.Utils.classForName(name) - try: - array = getClassByName("[B") - decoder = getClassByName("kafka.serializer.DefaultDecoder") - jstream = ssc._jvm.KafkaUtils.createStream(ssc._jssc, 
array, array, decoder, decoder, - jparam, jtopics, jlevel) - except Py4JError, e: + # Use KafkaUtilsPythonHelper to access Scala's KafkaUtils (see SPARK-6027) + helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\ + .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper") + helper = helperClass.newInstance() + jstream = helper.createStream(ssc._jssc, jparam, jtopics, jlevel) + except Py4JJavaError, e: # TODO: use --jar once it also work on driver - if not e.message or 'call a package' in e.message: - print "No kafka package, please put the assembly jar into classpath:" - print " $ bin/spark-submit --driver-class-path external/kafka-assembly/target/" + \ - "scala-*/spark-streaming-kafka-assembly-*.jar" + if 'ClassNotFoundException' in str(e.java_exception): + print """ +________________________________________________________________________________________________ + + Spark Streaming's Kafka libraries not found in class path. Try one of the following. + + 1. Include the Kafka library and its dependencies with in the + spark-submit command as + + $ bin/spark-submit --packages org.apache.spark:spark-streaming-kafka:%s ... + + 2. Download the JAR of the artifact from Maven Central http://search.maven.org/, + Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-assembly, Version = %s. + Then, innclude the jar in the spark-submit command as + + $ bin/spark-submit --jars ... + +________________________________________________________________________________________________ + +""" % (ssc.sparkContext.version, ssc.sparkContext.version) raise e ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) stream = DStream(jstream, ssc, ser) From 5f3238b3b0157091d28803aa3b1d248dfa6cdc59 Mon Sep 17 00:00:00 2001 From: Cheolsoo Park Date: Thu, 26 Feb 2015 13:53:49 -0800 Subject: [PATCH 325/357] [SPARK-6018] [YARN] NoSuchMethodError in Spark app is swallowed by YARN AM Author: Cheolsoo Park Closes #4773 from piaozhexiu/SPARK-6018 and squashes the following commits: 2a919d5 [Cheolsoo Park] Rename e with cause to avoid duplicate names 1e71d2d [Cheolsoo Park] Replace placeholder with throwable eb5750d [Cheolsoo Park] NoSuchMethodError in Spark app is swallowed by YARN AM --- .../org/apache/spark/deploy/yarn/ApplicationMaster.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index a9bf861d160c1..20fc19166ac4e 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -485,12 +485,12 @@ private[spark] class ApplicationMaster( e.getCause match { case _: InterruptedException => // Reporter thread can interrupt to stop user class - case e: Exception => + case cause: Throwable => finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_EXCEPTION_USER_CLASS, - "User class threw exception: " + e.getMessage) + "User class threw exception: " + cause.getMessage) // re-throw to get it logged - throw e + throw cause } } } From 3fb53c0298761ba227890525ae79ce4ec6300deb Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Thu, 26 Feb 2015 14:08:56 -0800 Subject: [PATCH 326/357] SPARK-4300 [CORE] Race condition during SparkWorker shutdown Close appender saving stdout/stderr before destroying process to avoid exception on reading closed input stream. 
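A minimal sketch of the ordering the change enforces, with an illustrative stopAppenders callback standing in for the stdout/stderr file appenders: stop the readers of the process's output streams first, then destroy the process and collect the exit code with a single waitFor().

object KillProcessSketch {
  def kill(process: Process, stopAppenders: () => Unit): Int = {
    stopAppenders()   // stop consumers of stdout/stderr before their streams close
    process.destroy()
    process.waitFor() // one waitFor() is enough to obtain the exit code
  }
}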
(This also removes a redundant `waitFor()` although it was harmless) CC tdas since I think you wrote this method. Author: Sean Owen Closes #4787 from srowen/SPARK-4300 and squashes the following commits: e0cdabf [Sean Owen] Close appender saving stdout/stderr before destroying process to avoid exception on reading closed input stream --- .../scala/org/apache/spark/deploy/worker/ExecutorRunner.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala index 6653aca0a0f06..066d46c4473eb 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala @@ -85,14 +85,13 @@ private[spark] class ExecutorRunner( var exitCode: Option[Int] = None if (process != null) { logInfo("Killing process!") - process.destroy() - process.waitFor() if (stdoutAppender != null) { stdoutAppender.stop() } if (stderrAppender != null) { stderrAppender.stop() } + process.destroy() exitCode = Some(process.waitFor()) } worker ! ExecutorStateChanged(appId, execId, state, message, exitCode) From c871e2dae0182e914135560d14304242e1f97f7e Mon Sep 17 00:00:00 2001 From: moussa taifi Date: Thu, 26 Feb 2015 14:19:43 -0800 Subject: [PATCH 327/357] Add a note for context termination for History server on Yarn The history server on Yarn only shows completed jobs. This adds a note concerning the needed explicit context termination at the end of a spark job which is a best practice anyway. Related to SPARK-2972 and SPARK-3458 Author: moussa taifi Closes #4721 from moutai/add-history-server-note-for-closing-the-spark-context and squashes the following commits: 9f5b6c3 [moussa taifi] Fix upper case typo for YARN 3ad3db4 [moussa taifi] Add context termination for History server on Yarn --- docs/monitoring.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/monitoring.md b/docs/monitoring.md index 37ede476c187d..6816671ffbf46 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -173,6 +173,8 @@ follows: Note that in all of these UIs, the tables are sortable by clicking their headers, making it easy to identify slow tasks, data skew, etc. +Note that the history server only displays completed Spark jobs. One way to signal the completion of a Spark job is to stop the Spark Context explicitly (`sc.stop()`), or in Python using the `with SparkContext() as sc:` to handle the Spark Context setup and tear down, and still show the job history on the UI. 
+ # Metrics Spark has a configurable metrics system based on the From b38dec2ffdf724ff4e181cc8c7427d074b442670 Mon Sep 17 00:00:00 2001 From: "mohit.goyal" Date: Thu, 26 Feb 2015 14:27:47 -0800 Subject: [PATCH 328/357] [SPARK-5951][YARN] Remove unreachable driver memory properties in yarn client mode Remove unreachable driver memory properties in yarn client mode Author: mohit.goyal Closes #4730 from zuxqoj/master and squashes the following commits: 977dc96 [mohit.goyal] remove not rechable deprecated variables in yarn client mode --- .../scheduler/cluster/YarnClientSchedulerBackend.scala | 6 ------ 1 file changed, 6 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index f1b5aafac4066..8abdc26b43806 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -79,18 +79,12 @@ private[spark] class YarnClientSchedulerBackend( ) // Warn against the following deprecated environment variables: env var -> suggestion val deprecatedEnvVars = Map( - "SPARK_MASTER_MEMORY" -> "SPARK_DRIVER_MEMORY or --driver-memory through spark-submit", "SPARK_WORKER_INSTANCES" -> "SPARK_WORKER_INSTANCES or --num-executors through spark-submit", "SPARK_WORKER_MEMORY" -> "SPARK_EXECUTOR_MEMORY or --executor-memory through spark-submit", "SPARK_WORKER_CORES" -> "SPARK_EXECUTOR_CORES or --executor-cores through spark-submit") - // Do the same for deprecated properties: property -> suggestion - val deprecatedProps = Map("spark.master.memory" -> "--driver-memory through spark-submit") optionTuples.foreach { case (optionName, envVar, sparkProp) => if (sc.getConf.contains(sparkProp)) { extraArgs += (optionName, sc.getConf.get(sparkProp)) - if (deprecatedProps.contains(sparkProp)) { - logWarning(s"NOTE: $sparkProp is deprecated. Use ${deprecatedProps(sparkProp)} instead.") - } } else if (System.getenv(envVar) != null) { extraArgs += (optionName, System.getenv(envVar)) if (deprecatedEnvVars.contains(envVar)) { From e60ad2f4c47b011be7a3198689ac2b82ee317d96 Mon Sep 17 00:00:00 2001 From: tedyu Date: Thu, 26 Feb 2015 23:26:07 +0000 Subject: [PATCH 329/357] SPARK-6045 RecordWriter should be checked against null in PairRDDFunctio... 
...ns#saveAsNewAPIHadoopDataset Author: tedyu Closes #4794 from tedyu/master and squashes the following commits: 2632a57 [tedyu] SPARK-6045 RecordWriter should be checked against null in PairRDDFunctions#saveAsNewAPIHadoopDataset 2d8d4b1 [tedyu] SPARK-6045 RecordWriter should be checked against null in PairRDDFunctions#saveAsNewAPIHadoopDataset --- core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 955b42c3baaa1..6b4f097ea9ae5 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -993,6 +993,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context) val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K,V]] + require(writer != null, "Unable to obtain RecordWriter") var recordsWritten = 0L try { while (iter.hasNext) { From fbc469473dd529eb72046186b85dd8fc2b7c5bb5 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Thu, 26 Feb 2015 17:35:09 -0800 Subject: [PATCH 330/357] SPARK-4579 [WEBUI] Scheduling Delay appears negative Ensure scheduler delay handles unfinished task case, and ensure delay is never negative even due to rounding Author: Sean Owen Closes #4796 from srowen/SPARK-4579 and squashes the following commits: ad6713c [Sean Owen] Ensure scheduler delay handles unfinished task case, and ensure delay is never negative even due to rounding --- .../scala/org/apache/spark/ui/jobs/StagePage.scala | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index d752434ad58ae..110f8780a9a12 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -626,15 +626,16 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") { } private def getSchedulerDelay(info: TaskInfo, metrics: TaskMetrics): Long = { - val totalExecutionTime = { - if (info.gettingResultTime > 0) { - (info.gettingResultTime - info.launchTime) + val totalExecutionTime = + if (info.gettingResult) { + info.gettingResultTime - info.launchTime + } else if (info.finished) { + info.finishTime - info.launchTime } else { - (info.finishTime - info.launchTime) + 0 } - } val executorOverhead = (metrics.executorDeserializeTime + metrics.resultSerializationTime) - totalExecutionTime - metrics.executorRunTime - executorOverhead + math.max(0, totalExecutionTime - metrics.executorRunTime - executorOverhead) } } From 18f2098433e0bfef9497bacd601fdf098ed03eab Mon Sep 17 00:00:00 2001 From: Hong Shen Date: Thu, 26 Feb 2015 18:43:23 -0800 Subject: [PATCH 331/357] [SPARK-5529][CORE]Add expireDeadHosts in HeartbeatReceiver If a blockManager has not send heartBeat more than 120s, BlockManagerMasterActor will remove it. But coarseGrainedSchedulerBackend can only remove executor after an DisassociatedEvent. We should expireDeadHosts at HeartbeatReceiver. Author: Hong Shen Closes #4363 from shenh062326/my_change3 and squashes the following commits: 2c9a46a [Hong Shen] Change some code style. 1a042ff [Hong Shen] Change some code style. 2dc456e [Hong Shen] Change some code style. 
d221493 [Hong Shen] Fix test failed 7448ac6 [Hong Shen] A minor change in sparkContext and heartbeatReceiver b904aed [Hong Shen] Fix failed test 52725af [Hong Shen] Remove assert in SparkContext.killExecutors 5bedcb8 [Hong Shen] Remove assert in SparkContext.killExecutors a858fb5 [Hong Shen] A minor change in HeartbeatReceiver 3e221d9 [Hong Shen] A minor change in HeartbeatReceiver 6bab7aa [Hong Shen] Change a code style. 07952f3 [Hong Shen] Change configs name and code style. ce9257e [Hong Shen] Fix test failed bccd515 [Hong Shen] Fix test failed 8e77408 [Hong Shen] Fix test failed c1dfda1 [Hong Shen] Fix test failed e197e20 [Hong Shen] Fix test failed fb5df97 [Hong Shen] Remove ExpireDeadHosts in BlockManagerMessages b5c0441 [Hong Shen] Remove expireDeadHosts in BlockManagerMasterActor c922cb0 [Hong Shen] Add expireDeadHosts in HeartbeatReceiver --- .../org/apache/spark/HeartbeatReceiver.scala | 65 +++++++++++++++++-- .../scala/org/apache/spark/SparkContext.scala | 15 +++-- .../spark/scheduler/TaskScheduler.scala | 6 +- .../spark/scheduler/TaskSchedulerImpl.scala | 2 +- .../storage/BlockManagerMasterActor.scala | 36 +--------- .../spark/storage/BlockManagerMessages.scala | 2 - .../spark/scheduler/DAGSchedulerSuite.scala | 2 + 7 files changed, 79 insertions(+), 49 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala index 83ae57b7f1516..69178da1a7773 100644 --- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala +++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala @@ -17,33 +17,86 @@ package org.apache.spark -import akka.actor.Actor +import scala.concurrent.duration._ +import scala.collection.mutable + +import akka.actor.{Actor, Cancellable} + import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockManagerId -import org.apache.spark.scheduler.TaskScheduler +import org.apache.spark.scheduler.{SlaveLost, TaskScheduler} import org.apache.spark.util.ActorLogReceive /** * A heartbeat from executors to the driver. This is a shared message used by several internal - * components to convey liveness or execution information for in-progress tasks. + * components to convey liveness or execution information for in-progress tasks. It will also + * expire the hosts that have not heartbeated for more than spark.network.timeout. */ private[spark] case class Heartbeat( executorId: String, taskMetrics: Array[(Long, TaskMetrics)], // taskId -> TaskMetrics blockManagerId: BlockManagerId) +private[spark] case object ExpireDeadHosts + private[spark] case class HeartbeatResponse(reregisterBlockManager: Boolean) /** * Lives in the driver to receive heartbeats from executors.. 
*/ -private[spark] class HeartbeatReceiver(scheduler: TaskScheduler) +private[spark] class HeartbeatReceiver(sc: SparkContext, scheduler: TaskScheduler) extends Actor with ActorLogReceive with Logging { + // executor ID -> timestamp of when the last heartbeat from this executor was received + private val executorLastSeen = new mutable.HashMap[String, Long] + + private val executorTimeoutMs = sc.conf.getLong("spark.network.timeout", + sc.conf.getLong("spark.storage.blockManagerSlaveTimeoutMs", 120)) * 1000 + + private val checkTimeoutIntervalMs = sc.conf.getLong("spark.network.timeoutInterval", + sc.conf.getLong("spark.storage.blockManagerTimeoutIntervalMs", 60)) * 1000 + + private var timeoutCheckingTask: Cancellable = null + + override def preStart(): Unit = { + import context.dispatcher + timeoutCheckingTask = context.system.scheduler.schedule(0.seconds, + checkTimeoutIntervalMs.milliseconds, self, ExpireDeadHosts) + super.preStart() + } + override def receiveWithLogging = { case Heartbeat(executorId, taskMetrics, blockManagerId) => - val response = HeartbeatResponse( - !scheduler.executorHeartbeatReceived(executorId, taskMetrics, blockManagerId)) + val unknownExecutor = !scheduler.executorHeartbeatReceived( + executorId, taskMetrics, blockManagerId) + val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor) + executorLastSeen(executorId) = System.currentTimeMillis() sender ! response + case ExpireDeadHosts => + expireDeadHosts() + } + + private def expireDeadHosts(): Unit = { + logTrace("Checking for hosts with no recent heartbeats in HeartbeatReceiver.") + val now = System.currentTimeMillis() + for ((executorId, lastSeenMs) <- executorLastSeen) { + if (now - lastSeenMs > executorTimeoutMs) { + logWarning(s"Removing executor $executorId with no recent heartbeats: " + + s"${now - lastSeenMs} ms exceeds timeout $executorTimeoutMs ms") + scheduler.executorLost(executorId, SlaveLost("Executor heartbeat " + + "timed out after ${now - lastSeenMs} ms")) + if (sc.supportDynamicAllocation) { + sc.killExecutor(executorId) + } + executorLastSeen.remove(executorId) + } + } + } + + override def postStop(): Unit = { + if (timeoutCheckingTask != null) { + timeoutCheckingTask.cancel() + } + super.postStop() } } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 930d4bea4785b..d3948d4e6d91b 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -351,7 +351,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli private[spark] var (schedulerBackend, taskScheduler) = SparkContext.createTaskScheduler(this, master) private val heartbeatReceiver = env.actorSystem.actorOf( - Props(new HeartbeatReceiver(taskScheduler)), "HeartbeatReceiver") + Props(new HeartbeatReceiver(this, taskScheduler)), "HeartbeatReceiver") @volatile private[spark] var dagScheduler: DAGScheduler = _ try { dagScheduler = new DAGScheduler(this) @@ -398,7 +398,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli private val dynamicAllocationTesting = conf.getBoolean("spark.dynamicAllocation.testing", false) private[spark] val executorAllocationManager: Option[ExecutorAllocationManager] = if (dynamicAllocationEnabled) { - assert(master.contains("yarn") || dynamicAllocationTesting, + assert(supportDynamicAllocation, "Dynamic allocation of executors is currently only supported in YARN mode") Some(new 
ExecutorAllocationManager(this, listenerBus, conf)) } else { @@ -1122,6 +1122,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli postEnvironmentUpdate() } + /** + * Return whether dynamically adjusting the amount of resources allocated to + * this application is supported. This is currently only available for YARN. + */ + private[spark] def supportDynamicAllocation = + master.contains("yarn") || dynamicAllocationTesting + /** * :: DeveloperApi :: * Register a listener to receive up-calls from events that happen during execution. @@ -1155,7 +1162,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli */ @DeveloperApi override def requestExecutors(numAdditionalExecutors: Int): Boolean = { - assert(master.contains("yarn") || dynamicAllocationTesting, + assert(supportDynamicAllocation, "Requesting executors is currently only supported in YARN mode") schedulerBackend match { case b: CoarseGrainedSchedulerBackend => @@ -1173,7 +1180,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli */ @DeveloperApi override def killExecutors(executorIds: Seq[String]): Boolean = { - assert(master.contains("yarn") || dynamicAllocationTesting, + assert(supportDynamicAllocation, "Killing executors is currently only supported in YARN mode") schedulerBackend match { case b: CoarseGrainedSchedulerBackend => diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala index f095915352b17..ed3418676e077 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala @@ -73,5 +73,9 @@ private[spark] trait TaskScheduler { * @return An application ID */ def applicationId(): String = appId - + + /** + * Process a lost executor + */ + def executorLost(executorId: String, reason: ExecutorLossReason): Unit } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 54f8fcfc416d1..7a9cf1c2e7f30 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -436,7 +436,7 @@ private[spark] class TaskSchedulerImpl( } } - def executorLost(executorId: String, reason: ExecutorLossReason) { + override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = { var failedExecutor: Option[String] = None synchronized { diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala index 64133464d8daa..787b0f96bec32 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala @@ -24,7 +24,7 @@ import scala.collection.JavaConversions._ import scala.concurrent.Future import scala.concurrent.duration._ -import akka.actor.{Actor, ActorRef, Cancellable} +import akka.actor.{Actor, ActorRef} import akka.pattern.ask import org.apache.spark.{Logging, SparkConf, SparkException} @@ -52,19 +52,6 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus private val akkaTimeout = AkkaUtils.askTimeout(conf) - val slaveTimeout = conf.getLong("spark.storage.blockManagerSlaveTimeoutMs", 120 * 1000) - - val checkTimeoutInterval = 
conf.getLong("spark.storage.blockManagerTimeoutIntervalMs", 60000) - - var timeoutCheckingTask: Cancellable = null - - override def preStart() { - import context.dispatcher - timeoutCheckingTask = context.system.scheduler.schedule(0.seconds, - checkTimeoutInterval.milliseconds, self, ExpireDeadHosts) - super.preStart() - } - override def receiveWithLogging = { case RegisterBlockManager(blockManagerId, maxMemSize, slaveActor) => register(blockManagerId, maxMemSize, slaveActor) @@ -118,14 +105,8 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus case StopBlockManagerMaster => sender ! true - if (timeoutCheckingTask != null) { - timeoutCheckingTask.cancel() - } context.stop(self) - case ExpireDeadHosts => - expireDeadHosts() - case BlockManagerHeartbeat(blockManagerId) => sender ! heartbeatReceived(blockManagerId) @@ -207,21 +188,6 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf, listenerBus logInfo(s"Removing block manager $blockManagerId") } - private def expireDeadHosts() { - logTrace("Checking for hosts with no recent heart beats in BlockManagerMaster.") - val now = System.currentTimeMillis() - val minSeenTime = now - slaveTimeout - val toRemove = new mutable.HashSet[BlockManagerId] - for (info <- blockManagerInfo.values) { - if (info.lastSeenMs < minSeenTime && !info.blockManagerId.isDriver) { - logWarning("Removing BlockManager " + info.blockManagerId + " with no recent heart beats: " - + (now - info.lastSeenMs) + "ms exceeds " + slaveTimeout + "ms") - toRemove += info.blockManagerId - } - } - toRemove.foreach(removeBlockManager) - } - private def removeExecutor(execId: String) { logInfo("Trying to remove executor " + execId + " from BlockManagerMaster.") blockManagerIdByExecutor.get(execId).foreach(removeBlockManager) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala index 3f32099d08cc9..48247453edef0 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala @@ -109,6 +109,4 @@ private[spark] object BlockManagerMessages { extends ToBlockManagerMaster case class BlockManagerHeartbeat(blockManagerId: BlockManagerId) extends ToBlockManagerMaster - - case object ExpireDeadHosts extends ToBlockManagerMaster } diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 4bf7f9e647d55..30119ce5d4eec 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -96,6 +96,7 @@ class DAGSchedulerSuite extends FunSuiteLike with BeforeAndAfter with LocalSpar } override def setDAGScheduler(dagScheduler: DAGScheduler) = {} override def defaultParallelism() = 2 + override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {} } /** Length of time to wait while draining listener events. 
*/ @@ -386,6 +387,7 @@ class DAGSchedulerSuite extends FunSuiteLike with BeforeAndAfter with LocalSpar override def defaultParallelism() = 2 override def executorHeartbeatReceived(execId: String, taskMetrics: Array[(Long, TaskMetrics)], blockManagerId: BlockManagerId): Boolean = true + override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {} } val noKillScheduler = new DAGScheduler( sc, From 4ad5153f5449319a7e82c9013ccff4494ab58ef1 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 27 Feb 2015 11:06:47 +0800 Subject: [PATCH 332/357] [SPARK-6037][SQL] Avoiding duplicate Parquet schema merging `FilteringParquetRowInputFormat` manually merges Parquet schemas before computing splits. However, it is duplicate because the schemas are already merged in `ParquetRelation2`. We don't need to re-merge them at `InputFormat`. Author: Liang-Chi Hsieh Closes #4786 from viirya/dup_parquet_schemas_merge and squashes the following commits: ef78a5a [Liang-Chi Hsieh] Avoiding duplicate Parquet schema merging. --- .../sql/parquet/ParquetTableOperations.scala | 23 ++++++------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 9061d3f5fee4d..4e4f647767dc9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -434,22 +434,13 @@ private[parquet] class FilteringParquetRowInputFormat return splits } - Option(globalMetaData.getKeyValueMetaData.get(RowReadSupport.SPARK_METADATA_KEY)).foreach { - schemas => - val mergedSchema = schemas - .map(DataType.fromJson(_).asInstanceOf[StructType]) - .reduce(_ merge _) - .json - - val mergedMetadata = globalMetaData - .getKeyValueMetaData - .updated(RowReadSupport.SPARK_METADATA_KEY, setAsJavaSet(Set(mergedSchema))) - - globalMetaData = new GlobalMetaData( - globalMetaData.getSchema, - mergedMetadata, - globalMetaData.getCreatedBy) - } + val metadata = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA) + val mergedMetadata = globalMetaData + .getKeyValueMetaData + .updated(RowReadSupport.SPARK_METADATA_KEY, setAsJavaSet(Set(metadata))) + + globalMetaData = new GlobalMetaData(globalMetaData.getSchema, + mergedMetadata, globalMetaData.getCreatedBy) val readContext = getReadSupport(configuration).init( new InitContext(configuration, From 5e5ad6558d60cfbf360708584e883e80d363e33e Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Thu, 26 Feb 2015 20:46:05 -0800 Subject: [PATCH 333/357] [SPARK-6024][SQL] When a data source table has too many columns, it's schema cannot be stored in metastore. JIRA: https://issues.apache.org/jira/browse/SPARK-6024 Author: Yin Huai Closes #4795 from yhuai/wideSchema and squashes the following commits: 4882e6f [Yin Huai] Address comments. 73e71b4 [Yin Huai] Address comments. 143927a [Yin Huai] Simplify code. cc1d472 [Yin Huai] Make the schema wider. 12bacae [Yin Huai] If the JSON string of a schema is too large, split it before storing it in metastore. e9b4f70 [Yin Huai] Failed test. 
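A rough, self-contained sketch of the split-and-reassemble idea described above, shown before the actual diff: the schema's JSON string is cut into pieces that fit under the metastore's 4000-character property limit and stitched back together on read. The property names follow the patch below; the in-memory map standing in for the Hive table properties, the error handling, and the driver code are illustrative only.

object SchemaChunking {
  import scala.collection.mutable

  // Split a long schema JSON string into pieces small enough for a
  // metastore table property (4000 characters by default).
  def writeSchema(props: mutable.Map[String, String],
                  schemaJson: String,
                  threshold: Int = 4000): Unit = {
    val parts = schemaJson.grouped(threshold).toSeq
    props("spark.sql.sources.schema.numParts") = parts.size.toString
    parts.zipWithIndex.foreach { case (part, index) =>
      props(s"spark.sql.sources.schema.part.$index") = part
    }
  }

  // Reassemble the pieces in order; a missing piece means the stored
  // metadata is corrupted, mirroring the check in the patch.
  def readSchema(props: collection.Map[String, String]): Option[String] =
    props.get("spark.sql.sources.schema.numParts").map { numParts =>
      (0 until numParts.toInt).map { index =>
        props.getOrElse(s"spark.sql.sources.schema.part.$index",
          sys.error(s"missing part $index of the schema"))
      }.mkString
    }

  def main(args: Array[String]): Unit = {
    val props = mutable.Map.empty[String, String]
    val json = "x" * 9000                    // stand-in for a very wide schema's JSON
    writeSchema(props, json)
    assert(readSchema(props).contains(json)) // round-trips through 3 parts
    println(s"stored in ${props.size - 1} parts")
  }
}
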
--- .../scala/org/apache/spark/sql/SQLConf.scala | 10 +++++++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 29 +++++++++++++++---- .../sql/hive/MetastoreDataSourcesSuite.scala | 21 ++++++++++++++ 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index a08c0f5ce3ff4..4815620c6fe57 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -51,6 +51,11 @@ private[spark] object SQLConf { // This is used to set the default data source val DEFAULT_DATA_SOURCE_NAME = "spark.sql.sources.default" + // This is used to control the when we will split a schema's JSON string to multiple pieces + // in order to fit the JSON string in metastore's table property (by default, the value has + // a length restriction of 4000 characters). We will split the JSON string of a schema + // to its length exceeds the threshold. + val SCHEMA_STRING_LENGTH_THRESHOLD = "spark.sql.sources.schemaStringLengthThreshold" // Whether to perform eager analysis when constructing a dataframe. // Set to false when debugging requires the ability to look at invalid query plans. @@ -177,6 +182,11 @@ private[sql] class SQLConf extends Serializable { private[spark] def defaultDataSourceName: String = getConf(DEFAULT_DATA_SOURCE_NAME, "org.apache.spark.sql.parquet") + // Do not use a value larger than 4000 as the default value of this property. + // See the comments of SCHEMA_STRING_LENGTH_THRESHOLD above for more information. + private[spark] def schemaStringLengthThreshold: Int = + getConf(SCHEMA_STRING_LENGTH_THRESHOLD, "4000").toInt + private[spark] def dataFrameEagerAnalysis: Boolean = getConf(DATAFRAME_EAGER_ANALYSIS, "true").toBoolean diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 8af5a4848fd44..d3ad364328265 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -69,13 +69,23 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with val table = synchronized { client.getTable(in.database, in.name) } - val schemaString = table.getProperty("spark.sql.sources.schema") val userSpecifiedSchema = - if (schemaString == null) { - None - } else { - Some(DataType.fromJson(schemaString).asInstanceOf[StructType]) + Option(table.getProperty("spark.sql.sources.schema.numParts")).map { numParts => + val parts = (0 until numParts.toInt).map { index => + val part = table.getProperty(s"spark.sql.sources.schema.part.${index}") + if (part == null) { + throw new AnalysisException( + s"Could not read schema from the metastore because it is corrupted " + + s"(missing part ${index} of the schema).") + } + + part + } + // Stick all parts back to a single schema string in the JSON representation + // and convert it back to a StructType. + DataType.fromJson(parts.mkString).asInstanceOf[StructType] } + // It does not appear that the ql client for the metastore has a way to enumerate all the // SerDe properties directly... 
val options = table.getTTable.getSd.getSerdeInfo.getParameters.toMap @@ -119,7 +129,14 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with tbl.setProperty("spark.sql.sources.provider", provider) if (userSpecifiedSchema.isDefined) { - tbl.setProperty("spark.sql.sources.schema", userSpecifiedSchema.get.json) + val threshold = hive.conf.schemaStringLengthThreshold + val schemaJsonString = userSpecifiedSchema.get.json + // Split the JSON string. + val parts = schemaJsonString.grouped(threshold).toSeq + tbl.setProperty("spark.sql.sources.schema.numParts", parts.size.toString) + parts.zipWithIndex.foreach { case (part, index) => + tbl.setProperty(s"spark.sql.sources.schema.part.${index}", part) + } } options.foreach { case (key, value) => tbl.setSerdeParam(key, value) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 0bd82773f3a55..00306f1cd7f86 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -591,4 +591,25 @@ class MetastoreDataSourcesSuite extends QueryTest with BeforeAndAfterEach { setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalUseDataSource) } } + + test("SPARK-6024 wide schema support") { + // We will need 80 splits for this schema if the threshold is 4000. + val schema = StructType((1 to 5000).map(i => StructField(s"c_${i}", StringType, true))) + assert( + schema.json.size > conf.schemaStringLengthThreshold, + "To correctly test the fix of SPARK-6024, the value of " + + s"spark.sql.sources.schemaStringLengthThreshold needs to be less than ${schema.json.size}") + // Manually create a metastore data source table. + catalog.createDataSourceTable( + tableName = "wide_schema", + userSpecifiedSchema = Some(schema), + provider = "json", + options = Map("path" -> "just a dummy path"), + isExternal = false) + + invalidateTable("wide_schema") + + val actualSchema = table("wide_schema").schema + assert(schema === actualSchema) + } } From 12135e90549f957962899487cd5eb95badd8976d Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 26 Feb 2015 22:35:43 -0800 Subject: [PATCH 334/357] [SPARK-5771][UI][hotfix] Change Requested Cores into * if default cores is not set cc andrewor14, srowen. 
Author: jerryshao Closes #4800 from jerryshao/SPARK-5771 and squashes the following commits: a2483c2 [jerryshao] Change the UI of Requested Cores into * if default cores is not set --- .../scala/org/apache/spark/deploy/master/ui/MasterPage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index 9dd96493ee48d..c7a71ea72a77f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -182,7 +182,7 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { } } - {app.requestedCores} + {if (app.requestedCores == Int.MaxValue) "*" else app.requestedCores} {Utils.megabytesToString(app.desc.memoryPerSlave)} From 67595eb8fb563eb26654f056033a01f0199bdf68 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 26 Feb 2015 22:36:48 -0800 Subject: [PATCH 335/357] [SPARK-5495][UI] Add app and driver kill function in master web UI Add application kill function in master web UI for standalone mode. Details can be seen in [SPARK-5495](https://issues.apache.org/jira/browse/SPARK-5495). The snapshot of UI shows as below: ![snapshot](https://dl.dropboxusercontent.com/u/19230832/master_ui.png) Please help to review, thanks a lot. Author: jerryshao Closes #4288 from jerryshao/SPARK-5495 and squashes the following commits: fa3e486 [jerryshao] Add some conditions 9a7be93 [jerryshao] Add kill Driver function a239776 [jerryshao] Change the code format ff5195d [jerryshao] Add app kill function in master web UI --- .../deploy/master/ApplicationState.scala | 2 +- .../spark/deploy/master/ui/MasterPage.scala | 53 +++++++++++++++++-- .../spark/deploy/master/ui/MasterWebUI.scala | 8 ++- 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala index 67e6c5d66af0e..f5b946329ae9b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala @@ -21,7 +21,7 @@ private[spark] object ApplicationState extends Enumeration { type ApplicationState = Value - val WAITING, RUNNING, FINISHED, FAILED, UNKNOWN = Value + val WAITING, RUNNING, FINISHED, FAILED, KILLED, UNKNOWN = Value val MAX_NUM_RETRY = 10 } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index c7a71ea72a77f..c086cadca2c7d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -26,8 +26,8 @@ import akka.pattern.ask import org.json4s.JValue import org.apache.spark.deploy.JsonProtocol -import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState} -import org.apache.spark.deploy.master.{ApplicationInfo, DriverInfo, WorkerInfo} +import org.apache.spark.deploy.DeployMessages.{RequestKillDriver, MasterStateResponse, RequestMasterState} +import org.apache.spark.deploy.master._ import org.apache.spark.ui.{WebUIPage, UIUtils} import org.apache.spark.util.Utils @@ -41,6 +41,31 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { JsonProtocol.writeMasterState(state) } + def 
handleAppKillRequest(request: HttpServletRequest): Unit = { + handleKillRequest(request, id => { + parent.master.idToApp.get(id).foreach { app => + parent.master.removeApplication(app, ApplicationState.KILLED) + } + }) + } + + def handleDriverKillRequest(request: HttpServletRequest): Unit = { + handleKillRequest(request, id => { master ! RequestKillDriver(id) }) + } + + private def handleKillRequest(request: HttpServletRequest, action: String => Unit): Unit = { + if (parent.killEnabled && + parent.master.securityMgr.checkModifyPermissions(request.getRemoteUser)) { + val killFlag = Option(request.getParameter("terminate")).getOrElse("false").toBoolean + val id = Option(request.getParameter("id")) + if (id.isDefined && killFlag) { + action(id.get) + } + + Thread.sleep(100) + } + } + /** Index view listing applications and executors */ def render(request: HttpServletRequest): Seq[Node] = { val stateFuture = (master ? RequestMasterState)(timeout).mapTo[MasterStateResponse] @@ -167,9 +192,20 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { } private def appRow(app: ApplicationInfo, active: Boolean): Seq[Node] = { + val killLink = if (parent.killEnabled && + (app.state == ApplicationState.RUNNING || app.state == ApplicationState.WAITING)) { + val killLinkUri = s"app/kill?id=${app.id}&terminate=true" + val confirm = "return window.confirm(" + + s"'Are you sure you want to kill application ${app.id} ?');" + + (kill) + + } + {app.id} + {killLink} {app.desc.name} @@ -203,8 +239,19 @@ private[spark] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { } private def driverRow(driver: DriverInfo): Seq[Node] = { + val killLink = if (parent.killEnabled && + (driver.state == DriverState.RUNNING || + driver.state == DriverState.SUBMITTED || + driver.state == DriverState.RELAUNCHING)) { + val killLinkUri = s"driver/kill?id=${driver.id}&terminate=true" + val confirm = "return window.confirm(" + + s"'Are you sure you want to kill driver ${driver.id} ?');" + + (kill) + + } - {driver.id} + {driver.id} {killLink} {driver.submitDate} {driver.worker.map(w => {w.id.toString}).getOrElse("None")} diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala index 73400c5affb5d..170f90a00ad2a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala @@ -32,15 +32,21 @@ class MasterWebUI(val master: Master, requestedPort: Int) val masterActorRef = master.self val timeout = AkkaUtils.askTimeout(master.conf) + val killEnabled = master.conf.getBoolean("spark.ui.killEnabled", true) initialize() /** Initialize all components of the server. */ def initialize() { + val masterPage = new MasterPage(this) attachPage(new ApplicationPage(this)) attachPage(new HistoryNotFoundPage(this)) - attachPage(new MasterPage(this)) + attachPage(masterPage) attachHandler(createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR, "/static")) + attachHandler( + createRedirectHandler("/app/kill", "/", masterPage.handleAppKillRequest)) + attachHandler( + createRedirectHandler("/driver/kill", "/", masterPage.handleDriverKillRequest)) } /** Attach a reconstructed UI to this Master UI. Only valid after bind(). 
*/ From 4a8a0a8ecd836bf7fe0f2e692cf20a62dda313c0 Mon Sep 17 00:00:00 2001 From: Lukasz Jastrzebski Date: Thu, 26 Feb 2015 22:38:06 -0800 Subject: [PATCH 336/357] SPARK-2168 [Spark core] Use relative URIs for the app links in the History Server. As agreed in PR #1160 adding test to verify if history server generates relative links to applications. Author: Lukasz Jastrzebski Closes #4778 from elyast/master and squashes the following commits: 0c07fab [Lukasz Jastrzebski] Incorporating comments for SPARK-2168 6d7866d [Lukasz Jastrzebski] Adjusting test for SPARK-2168 for master branch d6f4fbe [Lukasz Jastrzebski] Added test for SPARK-2168 --- .../deploy/history/HistoryServerSuite.scala | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala new file mode 100644 index 0000000000000..3a9963a5ce7b7 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.history + +import javax.servlet.http.HttpServletRequest + +import scala.collection.mutable + +import org.apache.hadoop.fs.Path +import org.mockito.Mockito.{when} +import org.scalatest.FunSuite +import org.scalatest.Matchers +import org.scalatest.mock.MockitoSugar + +import org.apache.spark.ui.SparkUI + +class HistoryServerSuite extends FunSuite with Matchers with MockitoSugar { + + test("generate history page with relative links") { + val historyServer = mock[HistoryServer] + val request = mock[HttpServletRequest] + val ui = mock[SparkUI] + val link = "/history/app1" + val info = new ApplicationHistoryInfo("app1", "app1", 0, 2, 1, "xxx", true) + when(historyServer.getApplicationList()).thenReturn(Seq(info)) + when(ui.basePath).thenReturn(link) + when(historyServer.getProviderConfig()).thenReturn(Map[String, String]()) + val page = new HistoryPage(historyServer) + + //when + val response = page.render(request) + + //then + val links = response \\ "a" + val justHrefs = for { + l <- links + attrs <- l.attribute("href") + } yield (attrs.toString) + justHrefs should contain(link) + } +} From 7c99a014fb1e370e038e0f537c2301ca8138e620 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Thu, 26 Feb 2015 22:39:46 -0800 Subject: [PATCH 337/357] [SPARK-6046] Privatize SparkConf.translateConfKey The warning of deprecated configs is actually done when the configs are set, not when they are get. 
As a result we don't need to explicitly call `translateConfKey` outside of `SparkConf` just to print the warning again in vain. Author: Andrew Or Closes #4797 from andrewor14/warn-deprecated-config and squashes the following commits: 8fb43e6 [Andrew Or] Privatize SparkConf.translateConfKey --- core/src/main/scala/org/apache/spark/SparkConf.scala | 2 +- .../org/apache/spark/deploy/history/FsHistoryProvider.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 0f4922ab4e310..61b34d524a421 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -407,7 +407,7 @@ private[spark] object SparkConf extends Logging { * @param warn Whether to print a warning if the key is deprecated. Warnings will be printed * only once for each key. */ - def translateConfKey(userKey: String, warn: Boolean = false): String = { + private def translateConfKey(userKey: String, warn: Boolean = false): String = { deprecatedConfigs.get(userKey) .map { deprecatedKey => if (warn) { diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 1aaa7b72735ab..3e3d6ff29faf0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -49,8 +49,8 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis // Interval between each check for event log updates private val UPDATE_INTERVAL_MS = conf.getOption("spark.history.fs.update.interval.seconds") - .orElse(conf.getOption(SparkConf.translateConfKey("spark.history.fs.updateInterval", true))) - .orElse(conf.getOption(SparkConf.translateConfKey("spark.history.updateInterval", true))) + .orElse(conf.getOption("spark.history.fs.updateInterval")) + .orElse(conf.getOption("spark.history.updateInterval")) .map(_.toInt) .getOrElse(10) * 1000 From 0375a413b8a009f5820897691570a1273ee25b97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E9=B9=8F?= Date: Thu, 26 Feb 2015 23:05:56 -0800 Subject: [PATCH 338/357] fix spark-6033, clarify the spark.worker.cleanup behavior in standalone mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira case spark-6033 https://issues.apache.org/jira/browse/SPARK-6033 In standalone deploy mode, the cleanup will only remove the stopped application's directories. The original description about the cleanup behavior is incorrect. Author: 许鹏 Closes #4803 from hseagle/spark-6033 and squashes the following commits: 927a6a0 [许鹏] fix the incorrect description about the spark.worker.cleanup in standalone mode --- docs/spark-standalone.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 5c6084fb46255..74d8653a8b845 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -222,8 +222,7 @@ SPARK_WORKER_OPTS supports the following system properties: false Enable periodic cleanup of worker / application directories. Note that this only affects standalone - mode, as YARN works differently. Applications directories are cleaned up regardless of whether - the application is still running. + mode, as YARN works differently. Only the directories of stopped applications are cleaned up. 
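A side note on a pattern that appears in several of the patches above (the heartbeat timeout in SPARK-5529 and the history provider update interval here): the new configuration key is preferred, deprecated keys are consulted next, and a hard-coded default is used last. Below is a minimal sketch of that chain with a plain Map standing in for SparkConf; the key names are taken from the FsHistoryProvider change above, everything else is illustrative.

object ConfigFallback {
  def main(args: Array[String]): Unit = {
    // Stand-in for SparkConf: only the oldest, deprecated key is set here.
    val conf = Map("spark.history.updateInterval" -> "30")

    // Prefer the new key, fall back to the deprecated ones, then to 10 seconds.
    val updateIntervalMs =
      conf.get("spark.history.fs.update.interval.seconds")
        .orElse(conf.get("spark.history.fs.updateInterval"))
        .orElse(conf.get("spark.history.updateInterval"))
        .map(_.toInt)
        .getOrElse(10) * 1000

    println(s"history update interval: $updateIntervalMs ms")  // prints 30000 ms
  }
}
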
From 8cd1692c9092150107bed27777951633cbf945f6 Mon Sep 17 00:00:00 2001 From: "Zhang, Liye" Date: Thu, 26 Feb 2015 23:11:43 -0800 Subject: [PATCH 339/357] [SPARK-6036][CORE] avoid race condition between eventlogListener and akka actor system For detail description, pls refer to [SPARK-6036](https://issues.apache.org/jira/browse/SPARK-6036). Author: Zhang, Liye Closes #4785 from liyezhang556520/EventLogInProcess and squashes the following commits: 8b0b0a6 [Zhang, Liye] stop listener after DAGScheduler 79b15b3 [Zhang, Liye] SPARK-6036 avoid race condition between eventlogListener and akka actor system --- core/src/main/scala/org/apache/spark/SparkContext.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index d3948d4e6d91b..3cd0c218a36fd 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1389,17 +1389,17 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli stopped = true env.metricsSystem.report() metadataCleaner.cancel() - env.actorSystem.stop(heartbeatReceiver) cleaner.foreach(_.stop()) dagScheduler.stop() dagScheduler = null + listenerBus.stop() + eventLogger.foreach(_.stop()) + env.actorSystem.stop(heartbeatReceiver) progressBar.foreach(_.stop()) taskScheduler = null // TODO: Cache.stop()? env.stop() SparkEnv.set(null) - listenerBus.stop() - eventLogger.foreach(_.stop()) logInfo("Successfully stopped SparkContext") SparkContext.clearActiveContext() } else { From e747e98490f8ede23b0a9e0795e7445d0b597624 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Fri, 27 Feb 2015 13:31:46 +0000 Subject: [PATCH 340/357] [SPARK-6058][Yarn] Log the user class exception in ApplicationMaster Because ApplicationMaster doesn't set SparkUncaughtExceptionHandler, the exception in the user class won't be logged. This PR added a `logError` for it. Author: zsxwing Closes #4813 from zsxwing/SPARK-6058 and squashes the following commits: 806c932 [zsxwing] Log the user class exception --- .../scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 20fc19166ac4e..796422bc25b59 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -486,11 +486,10 @@ private[spark] class ApplicationMaster( case _: InterruptedException => // Reporter thread can interrupt to stop user class case cause: Throwable => + logError("User class threw exception: " + cause.getMessage, cause) finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_EXCEPTION_USER_CLASS, "User class threw exception: " + cause.getMessage) - // re-throw to get it logged - throw cause } } } From 57566d0af3008982a1e24a763ed2f6a700b40f8f Mon Sep 17 00:00:00 2001 From: zsxwing Date: Fri, 27 Feb 2015 13:33:39 +0000 Subject: [PATCH 341/357] [SPARK-6059][Yarn] Add volatile to ApplicationMaster's reporterThread and allocator `ApplicationMaster.reporterThread` and `ApplicationMaster.allocator` are accessed in multiple threads, so they should be marked as `volatile`. 
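For readers less familiar with the JVM memory model, the point of the change is visibility: a field written by one thread and read by another is not guaranteed to be seen by the reader unless it is marked `@volatile` (or protected by other synchronization). A tiny self-contained illustration of the annotation, deliberately unrelated to the actual ApplicationMaster fields:

object VolatileVisibility {
  // Written by the main thread, read by a worker thread. Without @volatile
  // the JVM is allowed to let the worker keep a stale cached value and
  // spin forever without ever observing the update.
  @volatile private var stop: Boolean = false

  def main(args: Array[String]): Unit = {
    val worker = new Thread(new Runnable {
      override def run(): Unit = {
        while (!stop) {
          // busy-wait until the write to `stop` becomes visible
        }
        println("worker observed stop = true")
      }
    })
    worker.start()
    Thread.sleep(100)
    stop = true    // the volatile write is guaranteed to become visible to the worker
    worker.join()
  }
}
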
Author: zsxwing Closes #4814 from zsxwing/SPARK-6059 and squashes the following commits: 17d9386 [zsxwing] Add volatile to ApplicationMaster's reporterThread and allocator --- .../org/apache/spark/deploy/yarn/ApplicationMaster.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 796422bc25b59..e966bfba7bb7d 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -68,8 +68,8 @@ private[spark] class ApplicationMaster( @volatile private var finalMsg: String = "" @volatile private var userClassThread: Thread = _ - private var reporterThread: Thread = _ - private var allocator: YarnAllocator = _ + @volatile private var reporterThread: Thread = _ + @volatile private var allocator: YarnAllocator = _ // Fields used in client mode. private var actorSystem: ActorSystem = null From d17cb2ba33b363dd346ac5a5681e1757decd0f4d Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Fri, 27 Feb 2015 13:00:36 -0800 Subject: [PATCH 342/357] [SPARK-4587] [mllib] [docs] Fixed save,load calls in ML guide examples Should pass spark context to save/load CC: mengxr Author: Joseph K. Bradley Closes #4816 from jkbradley/ml-io-doc-fix and squashes the following commits: 83d369d [Joseph K. Bradley] added comment to save,load parts of ML guide examples 2841170 [Joseph K. Bradley] Fixed save,load calls in ML guide examples --- docs/mllib-collaborative-filtering.md | 10 ++++--- docs/mllib-decision-tree.md | 20 ++++++++------ docs/mllib-ensembles.md | 40 ++++++++++++++++----------- docs/mllib-linear-methods.md | 20 ++++++++------ docs/mllib-naive-bayes.md | 10 ++++--- 5 files changed, 60 insertions(+), 40 deletions(-) diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index 935cd8dad3b25..27aa4d38b7617 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -97,8 +97,9 @@ val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) => }.mean() println("Mean Squared Error = " + MSE) -model.save("myModelPath") -val sameModel = MatrixFactorizationModel.load("myModelPath") +// Save and load model +model.save(sc, "myModelPath") +val sameModel = MatrixFactorizationModel.load(sc, "myModelPath") {% endhighlight %} If the rating matrix is derived from another source of information (e.g., it is inferred from @@ -186,8 +187,9 @@ public class CollaborativeFiltering { ).rdd()).mean(); System.out.println("Mean Squared Error = " + MSE); - model.save("myModelPath"); - MatrixFactorizationModel sameModel = MatrixFactorizationModel.load("myModelPath"); + // Save and load model + model.save(sc.sc(), "myModelPath"); + MatrixFactorizationModel sameModel = MatrixFactorizationModel.load(sc.sc(), "myModelPath"); } } {% endhighlight %} diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index 4695d1cde4901..8e478ab035582 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -223,8 +223,9 @@ val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData. 
println("Test Error = " + testErr) println("Learned classification tree model:\n" + model.toDebugString) -model.save("myModelPath") -val sameModel = DecisionTreeModel.load("myModelPath") +// Save and load model +model.save(sc, "myModelPath") +val sameModel = DecisionTreeModel.load(sc, "myModelPath") {% endhighlight %} @@ -284,8 +285,9 @@ Double testErr = System.out.println("Test Error: " + testErr); System.out.println("Learned classification tree model:\n" + model.toDebugString()); -model.save("myModelPath"); -DecisionTreeModel sameModel = DecisionTreeModel.load("myModelPath"); +// Save and load model +model.save(sc.sc(), "myModelPath"); +DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), "myModelPath"); {% endhighlight %} @@ -362,8 +364,9 @@ val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean println("Test Mean Squared Error = " + testMSE) println("Learned regression tree model:\n" + model.toDebugString) -model.save("myModelPath") -val sameModel = DecisionTreeModel.load("myModelPath") +// Save and load model +model.save(sc, "myModelPath") +val sameModel = DecisionTreeModel.load(sc, "myModelPath") {% endhighlight %} @@ -429,8 +432,9 @@ Double testMSE = System.out.println("Test Mean Squared Error: " + testMSE); System.out.println("Learned regression tree model:\n" + model.toDebugString()); -model.save("myModelPath"); -DecisionTreeModel sameModel = DecisionTreeModel.load("myModelPath"); +// Save and load model +model.save(sc.sc(), "myModelPath"); +DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), "myModelPath"); {% endhighlight %} diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md index ddae84165f8a9..ec1ef38b453d3 100644 --- a/docs/mllib-ensembles.md +++ b/docs/mllib-ensembles.md @@ -129,8 +129,9 @@ val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData. 
println("Test Error = " + testErr) println("Learned classification forest model:\n" + model.toDebugString) -model.save("myModelPath") -val sameModel = RandomForestModel.load("myModelPath") +// Save and load model +model.save(sc, "myModelPath") +val sameModel = RandomForestModel.load(sc, "myModelPath") {% endhighlight %} @@ -193,8 +194,9 @@ Double testErr = System.out.println("Test Error: " + testErr); System.out.println("Learned classification forest model:\n" + model.toDebugString()); -model.save("myModelPath"); -RandomForestModel sameModel = RandomForestModel.load("myModelPath"); +// Save and load model +model.save(sc.sc(), "myModelPath"); +RandomForestModel sameModel = RandomForestModel.load(sc.sc(), "myModelPath"); {% endhighlight %} @@ -276,8 +278,9 @@ val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean println("Test Mean Squared Error = " + testMSE) println("Learned regression forest model:\n" + model.toDebugString) -model.save("myModelPath") -val sameModel = RandomForestModel.load("myModelPath") +// Save and load model +model.save(sc, "myModelPath") +val sameModel = RandomForestModel.load(sc, "myModelPath") {% endhighlight %} @@ -343,8 +346,9 @@ Double testMSE = System.out.println("Test Mean Squared Error: " + testMSE); System.out.println("Learned regression forest model:\n" + model.toDebugString()); -model.save("myModelPath"); -RandomForestModel sameModel = RandomForestModel.load("myModelPath"); +// Save and load model +model.save(sc.sc(), "myModelPath"); +RandomForestModel sameModel = RandomForestModel.load(sc.sc(), "myModelPath"); {% endhighlight %} @@ -504,8 +508,9 @@ val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData. println("Test Error = " + testErr) println("Learned classification GBT model:\n" + model.toDebugString) -model.save("myModelPath") -val sameModel = GradientBoostedTreesModel.load("myModelPath") +// Save and load model +model.save(sc, "myModelPath") +val sameModel = GradientBoostedTreesModel.load(sc, "myModelPath") {% endhighlight %} @@ -568,8 +573,9 @@ Double testErr = System.out.println("Test Error: " + testErr); System.out.println("Learned classification GBT model:\n" + model.toDebugString()); -model.save("myModelPath"); -GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load("myModelPath"); +// Save and load model +model.save(sc.sc(), "myModelPath"); +GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(sc.sc(), "myModelPath"); {% endhighlight %} @@ -647,8 +653,9 @@ val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean println("Test Mean Squared Error = " + testMSE) println("Learned regression GBT model:\n" + model.toDebugString) -model.save("myModelPath") -val sameModel = GradientBoostedTreesModel.load("myModelPath") +// Save and load model +model.save(sc, "myModelPath") +val sameModel = GradientBoostedTreesModel.load(sc, "myModelPath") {% endhighlight %} @@ -717,8 +724,9 @@ Double testMSE = System.out.println("Test Mean Squared Error: " + testMSE); System.out.println("Learned regression GBT model:\n" + model.toDebugString()); -model.save("myModelPath"); -GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load("myModelPath"); +// Save and load model +model.save(sc.sc(), "myModelPath"); +GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(sc.sc(), "myModelPath"); {% endhighlight %} diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index d9fc63b37d116..ffbd7ef1bff51 100644 --- 
a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -223,8 +223,9 @@ val auROC = metrics.areaUnderROC() println("Area under ROC = " + auROC) -model.save("myModelPath") -val sameModel = SVMModel.load("myModelPath") +// Save and load model +model.save(sc, "myModelPath") +val sameModel = SVMModel.load(sc, "myModelPath") {% endhighlight %} The `SVMWithSGD.train()` method by default performs L2 regularization with the @@ -308,8 +309,9 @@ public class SVMClassifier { System.out.println("Area under ROC = " + auROC); - model.save("myModelPath"); - SVMModel sameModel = SVMModel.load("myModelPath"); + // Save and load model + model.save(sc.sc(), "myModelPath"); + SVMModel sameModel = SVMModel.load(sc.sc(), "myModelPath"); } } {% endhighlight %} @@ -423,8 +425,9 @@ val valuesAndPreds = parsedData.map { point => val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean() println("training Mean Squared Error = " + MSE) -model.save("myModelPath") -val sameModel = LinearRegressionModel.load("myModelPath") +// Save and load model +model.save(sc, "myModelPath") +val sameModel = LinearRegressionModel.load(sc, "myModelPath") {% endhighlight %} [`RidgeRegressionWithSGD`](api/scala/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD) @@ -496,8 +499,9 @@ public class LinearRegression { ).rdd()).mean(); System.out.println("training Mean Squared Error = " + MSE); - model.save("myModelPath"); - LinearRegressionModel sameModel = LinearRegressionModel.load("myModelPath"); + // Save and load model + model.save(sc.sc(), "myModelPath"); + LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(), "myModelPath"); } } {% endhighlight %} diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md index 81173255b590d..5224a0b49a991 100644 --- a/docs/mllib-naive-bayes.md +++ b/docs/mllib-naive-bayes.md @@ -56,8 +56,9 @@ val model = NaiveBayes.train(training, lambda = 1.0) val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() -model.save("myModelPath") -val sameModel = NaiveBayesModel.load("myModelPath") +// Save and load model +model.save(sc, "myModelPath") +val sameModel = NaiveBayesModel.load(sc, "myModelPath") {% endhighlight %} @@ -97,8 +98,9 @@ double accuracy = predictionAndLabel.filter(new Function, } }).count() / (double) test.count(); -model.save("myModelPath"); -NaiveBayesModel sameModel = NaiveBayesModel.load("myModelPath"); +// Save and load model +model.save(sc.sc(), "myModelPath"); +NaiveBayesModel sameModel = NaiveBayesModel.load(sc.sc(), "myModelPath"); {% endhighlight %} From 5f7f3b938e1776168be866fc9ee87dc7494696cc Mon Sep 17 00:00:00 2001 From: Saisai Shao Date: Fri, 27 Feb 2015 13:01:42 -0800 Subject: [PATCH 343/357] [Streaming][Minor] Remove useless type signature of Java Kafka direct stream API cc tdas . 
Author: Saisai Shao Closes #4817 from jerryshao/signature-minor-fix and squashes the following commits: eebfaac [Saisai Shao] Remove useless type parameter --- .../scala/org/apache/spark/streaming/kafka/KafkaUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index 62a659518943d..5a9bd4214cf51 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -512,7 +512,7 @@ object KafkaUtils { * @param topics Names of the topics to consume */ @Experimental - def createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V], R]( + def createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V]]( jssc: JavaStreamingContext, keyClass: Class[K], valueClass: Class[V], From 8c468a6600e0deb5464990df60148212e64fdecd Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sat, 28 Feb 2015 08:41:49 +0800 Subject: [PATCH 344/357] [SPARK-5751] [SQL] Sets SPARK_HOME as SPARK_PID_DIR when running Thrift server test suites This is a follow-up of #4720. By default, `spark-daemon.sh` writes PID files under `/tmp`, which makes it impossible to start multiple server instances simultaneously. This PR sets `SPARK_PID_DIR` to Spark home directory to workaround this problem. Many thanks to chenghao-intel for pointing out this issue! [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4758) Author: Cheng Lian Closes #4758 from liancheng/thriftserver-pid-dir and squashes the following commits: 252fa0f [Cheng Lian] Uses temporary directory as Thrift server PID directory 1b3d1e3 [Cheng Lian] Sets SPARK_HOME as SPARK_PID_DIR when running Thrift server test suites --- .../hive/thriftserver/HiveThriftServer2Suites.scala | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index 77ef37253e38f..d783d487b5c60 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -39,6 +39,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.apache.spark.Logging import org.apache.spark.sql.catalyst.util import org.apache.spark.sql.hive.HiveShim +import org.apache.spark.util.Utils object TestData { def getTestDataFilePath(name: String) = { @@ -273,6 +274,7 @@ abstract class HiveThriftServer2Test extends FunSuite with BeforeAndAfterAll wit private var metastorePath: File = _ private def metastoreJdbcUri = s"jdbc:derby:;databaseName=$metastorePath;create=true" + private val pidDir: File = Utils.createTempDir("thriftserver-pid") private var logPath: File = _ private var logTailingProcess: Process = _ private var diagnosisBuffer: ArrayBuffer[String] = ArrayBuffer.empty[String] @@ -315,7 +317,14 @@ abstract class HiveThriftServer2Test extends FunSuite with BeforeAndAfterAll wit logInfo(s"Trying to start HiveThriftServer2: port=$port, mode=$mode, attempt=$attempt") - logPath = Process(command, None, "SPARK_TESTING" -> "0").lines.collectFirst { + val env = Seq( + // Disables SPARK_TESTING to exclude log4j.properties 
in test directories. + "SPARK_TESTING" -> "0", + // Points SPARK_PID_DIR to SPARK_HOME, otherwise only 1 Thrift server instance can be started + // at a time, which is not Jenkins friendly. + "SPARK_PID_DIR" -> pidDir.getCanonicalPath) + + logPath = Process(command, None, env: _*).lines.collectFirst { case line if line.contains(LOG_FILE_MARK) => new File(line.drop(LOG_FILE_MARK.length)) }.getOrElse { throw new RuntimeException("Failed to find HiveThriftServer2 log file.") @@ -346,7 +355,7 @@ abstract class HiveThriftServer2Test extends FunSuite with BeforeAndAfterAll wit private def stopThriftServer(): Unit = { // The `spark-daemon.sh' script uses kill, which is not synchronous, have to wait for a while. - Process(stopScript, None).run().exitValue() + Process(stopScript, None, "SPARK_PID_DIR" -> pidDir.getCanonicalPath).run().exitValue() Thread.sleep(3.seconds.toMillis) warehousePath.delete() From e0e64ba4b1b8eb72e856286f756c65fa22ab0a36 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Fri, 27 Feb 2015 20:07:17 -0800 Subject: [PATCH 345/357] [SPARK-6055] [PySpark] fix incorrect __eq__ of DataType The _eq_ of DataType is not correct, class cache is not use correctly (created class can not be find by dataType), then it will create lots of classes (saved in _cached_cls), never released. Also, all same DataType have same hash code, there will be many object in a dict with the same hash code, end with hash attach, it's very slow to access this dict (depends on the implementation of CPython). This PR also improve the performance of inferSchema (avoid the unnecessary converter of object). cc pwendell JoshRosen Author: Davies Liu Closes #4808 from davies/leak and squashes the following commits: 6a322a4 [Davies Liu] tests refactor 3da44fc [Davies Liu] fix __eq__ of Singleton 534ac90 [Davies Liu] add more checks 46999dc [Davies Liu] fix tests d9ae973 [Davies Liu] fix memory leak in sql --- python/pyspark/sql/context.py | 90 +----------------------- python/pyspark/sql/dataframe.py | 4 +- python/pyspark/sql/tests.py | 9 +++ python/pyspark/sql/types.py | 120 +++++++++++++++++++------------- 4 files changed, 86 insertions(+), 137 deletions(-) diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 5d7aeb664cadf..795ef0dbc4c47 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -17,7 +17,6 @@ import warnings import json -from array import array from itertools import imap from py4j.protocol import Py4JError @@ -25,7 +24,7 @@ from pyspark.rdd import RDD, _prepare_for_python_RDD from pyspark.serializers import AutoBatchedSerializer, PickleSerializer -from pyspark.sql.types import StringType, StructType, _verify_type, \ +from pyspark.sql.types import Row, StringType, StructType, _verify_type, \ _infer_schema, _has_nulltype, _merge_type, _create_converter, _python_to_sql_converter from pyspark.sql.dataframe import DataFrame @@ -620,93 +619,6 @@ def _get_hive_ctx(self): return self._jvm.HiveContext(self._jsc.sc()) -def _create_row(fields, values): - row = Row(*values) - row.__FIELDS__ = fields - return row - - -class Row(tuple): - - """ - A row in L{DataFrame}. The fields in it can be accessed like attributes. - - Row can be used to create a row object by using named arguments, - the fields will be sorted by names. 
- - >>> row = Row(name="Alice", age=11) - >>> row - Row(age=11, name='Alice') - >>> row.name, row.age - ('Alice', 11) - - Row also can be used to create another Row like class, then it - could be used to create Row objects, such as - - >>> Person = Row("name", "age") - >>> Person - - >>> Person("Alice", 11) - Row(name='Alice', age=11) - """ - - def __new__(self, *args, **kwargs): - if args and kwargs: - raise ValueError("Can not use both args " - "and kwargs to create Row") - if args: - # create row class or objects - return tuple.__new__(self, args) - - elif kwargs: - # create row objects - names = sorted(kwargs.keys()) - values = tuple(kwargs[n] for n in names) - row = tuple.__new__(self, values) - row.__FIELDS__ = names - return row - - else: - raise ValueError("No args or kwargs") - - def asDict(self): - """ - Return as an dict - """ - if not hasattr(self, "__FIELDS__"): - raise TypeError("Cannot convert a Row class into dict") - return dict(zip(self.__FIELDS__, self)) - - # let obect acs like class - def __call__(self, *args): - """create new Row object""" - return _create_row(self, args) - - def __getattr__(self, item): - if item.startswith("__"): - raise AttributeError(item) - try: - # it will be slow when it has many fields, - # but this will not be used in normal cases - idx = self.__FIELDS__.index(item) - return self[idx] - except IndexError: - raise AttributeError(item) - - def __reduce__(self): - if hasattr(self, "__FIELDS__"): - return (_create_row, (self.__FIELDS__, tuple(self))) - else: - return tuple.__reduce__(self) - - def __repr__(self): - if hasattr(self, "__FIELDS__"): - return "Row(%s)" % ", ".join("%s=%r" % (k, v) - for k, v in zip(self.__FIELDS__, self)) - else: - return "" % ", ".join(self) - - def _test(): import doctest from pyspark.context import SparkContext diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index aec99017fbdc1..5c3b7377c33b5 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1025,10 +1025,12 @@ def cast(self, dataType): ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc()) jdt = ssql_ctx.parseDataType(dataType.json()) jc = self._jc.cast(jdt) + else: + raise TypeError("unexpected type: %s" % type(dataType)) return Column(jc) def __repr__(self): - return 'Column<%s>' % self._jdf.toString().encode('utf8') + return 'Column<%s>' % self._jc.toString().encode('utf8') def _test(): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 83899ad4b1b12..2720439416682 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -24,6 +24,7 @@ import pydoc import shutil import tempfile +import pickle import py4j @@ -88,6 +89,14 @@ def __eq__(self, other): other.x == self.x and other.y == self.y +class DataTypeTests(unittest.TestCase): + # regression test for SPARK-6055 + def test_data_type_eq(self): + lt = LongType() + lt2 = pickle.loads(pickle.dumps(LongType())) + self.assertEquals(lt, lt2) + + class SQLTests(ReusedPySparkTestCase): @classmethod diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 0f5dc2be6dab8..31a861e1feb46 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -21,6 +21,7 @@ import warnings import json import re +import weakref from array import array from operator import itemgetter @@ -42,8 +43,7 @@ def __hash__(self): return hash(str(self)) def __eq__(self, other): - return (isinstance(other, self.__class__) and - self.__dict__ == other.__dict__) + return isinstance(other, self.__class__) and 
self.__dict__ == other.__dict__ def __ne__(self, other): return not self.__eq__(other) @@ -64,6 +64,8 @@ def json(self): sort_keys=True) +# This singleton pattern does not work with pickle, you will get +# another object after pickle and unpickle class PrimitiveTypeSingleton(type): """Metaclass for PrimitiveType""" @@ -82,10 +84,6 @@ class PrimitiveType(DataType): __metaclass__ = PrimitiveTypeSingleton - def __eq__(self, other): - # because they should be the same object - return self is other - class NullType(PrimitiveType): @@ -242,11 +240,12 @@ def __init__(self, elementType, containsNull=True): :param elementType: the data type of elements. :param containsNull: indicates whether the list contains None values. - >>> ArrayType(StringType) == ArrayType(StringType, True) + >>> ArrayType(StringType()) == ArrayType(StringType(), True) True - >>> ArrayType(StringType, False) == ArrayType(StringType) + >>> ArrayType(StringType(), False) == ArrayType(StringType()) False """ + assert isinstance(elementType, DataType), "elementType should be DataType" self.elementType = elementType self.containsNull = containsNull @@ -292,13 +291,15 @@ def __init__(self, keyType, valueType, valueContainsNull=True): :param valueContainsNull: indicates whether values contains null values. - >>> (MapType(StringType, IntegerType) - ... == MapType(StringType, IntegerType, True)) + >>> (MapType(StringType(), IntegerType()) + ... == MapType(StringType(), IntegerType(), True)) True - >>> (MapType(StringType, IntegerType, False) - ... == MapType(StringType, FloatType)) + >>> (MapType(StringType(), IntegerType(), False) + ... == MapType(StringType(), FloatType())) False """ + assert isinstance(keyType, DataType), "keyType should be DataType" + assert isinstance(valueType, DataType), "valueType should be DataType" self.keyType = keyType self.valueType = valueType self.valueContainsNull = valueContainsNull @@ -348,13 +349,14 @@ def __init__(self, name, dataType, nullable=True, metadata=None): to simple type that can be serialized to JSON automatically - >>> (StructField("f1", StringType, True) - ... == StructField("f1", StringType, True)) + >>> (StructField("f1", StringType(), True) + ... == StructField("f1", StringType(), True)) True - >>> (StructField("f1", StringType, True) - ... == StructField("f2", StringType, True)) + >>> (StructField("f1", StringType(), True) + ... == StructField("f2", StringType(), True)) False """ + assert isinstance(dataType, DataType), "dataType should be DataType" self.name = name self.dataType = dataType self.nullable = nullable @@ -393,16 +395,17 @@ class StructType(DataType): def __init__(self, fields): """Creates a StructType - >>> struct1 = StructType([StructField("f1", StringType, True)]) - >>> struct2 = StructType([StructField("f1", StringType, True)]) + >>> struct1 = StructType([StructField("f1", StringType(), True)]) + >>> struct2 = StructType([StructField("f1", StringType(), True)]) >>> struct1 == struct2 True - >>> struct1 = StructType([StructField("f1", StringType, True)]) - >>> struct2 = StructType([StructField("f1", StringType, True), - ... [StructField("f2", IntegerType, False)]]) + >>> struct1 = StructType([StructField("f1", StringType(), True)]) + >>> struct2 = StructType([StructField("f1", StringType(), True), + ... 
StructField("f2", IntegerType(), False)]) >>> struct1 == struct2 False """ + assert all(isinstance(f, DataType) for f in fields), "fields should be a list of DataType" self.fields = fields def simpleString(self): @@ -505,20 +508,24 @@ def __eq__(self, other): def _parse_datatype_json_string(json_string): """Parses the given data type JSON string. + >>> import pickle >>> def check_datatype(datatype): + ... pickled = pickle.loads(pickle.dumps(datatype)) + ... assert datatype == pickled ... scala_datatype = sqlCtx._ssql_ctx.parseDataType(datatype.json()) ... python_datatype = _parse_datatype_json_string(scala_datatype.json()) - ... return datatype == python_datatype - >>> all(check_datatype(cls()) for cls in _all_primitive_types.values()) - True + ... assert datatype == python_datatype + >>> for cls in _all_primitive_types.values(): + ... check_datatype(cls()) + >>> # Simple ArrayType. >>> simple_arraytype = ArrayType(StringType(), True) >>> check_datatype(simple_arraytype) - True + >>> # Simple MapType. >>> simple_maptype = MapType(StringType(), LongType()) >>> check_datatype(simple_maptype) - True + >>> # Simple StructType. >>> simple_structtype = StructType([ ... StructField("a", DecimalType(), False), @@ -526,7 +533,7 @@ def _parse_datatype_json_string(json_string): ... StructField("c", LongType(), True), ... StructField("d", BinaryType(), False)]) >>> check_datatype(simple_structtype) - True + >>> # Complex StructType. >>> complex_structtype = StructType([ ... StructField("simpleArray", simple_arraytype, True), @@ -535,22 +542,20 @@ def _parse_datatype_json_string(json_string): ... StructField("boolean", BooleanType(), False), ... StructField("withMeta", DoubleType(), False, {"name": "age"})]) >>> check_datatype(complex_structtype) - True + >>> # Complex ArrayType. >>> complex_arraytype = ArrayType(complex_structtype, True) >>> check_datatype(complex_arraytype) - True + >>> # Complex MapType. >>> complex_maptype = MapType(complex_structtype, ... complex_arraytype, False) >>> check_datatype(complex_maptype) - True + >>> check_datatype(ExamplePointUDT()) - True >>> structtype_with_udt = StructType([StructField("label", DoubleType(), False), ... 
StructField("point", ExamplePointUDT(), False)]) >>> check_datatype(structtype_with_udt) - True """ return _parse_datatype_json_value(json.loads(json_string)) @@ -786,8 +791,24 @@ def _merge_type(a, b): return a +def _need_converter(dataType): + if isinstance(dataType, StructType): + return True + elif isinstance(dataType, ArrayType): + return _need_converter(dataType.elementType) + elif isinstance(dataType, MapType): + return _need_converter(dataType.keyType) or _need_converter(dataType.valueType) + elif isinstance(dataType, NullType): + return True + else: + return False + + def _create_converter(dataType): """Create an converter to drop the names of fields in obj """ + if not _need_converter(dataType): + return lambda x: x + if isinstance(dataType, ArrayType): conv = _create_converter(dataType.elementType) return lambda row: map(conv, row) @@ -806,13 +827,17 @@ def _create_converter(dataType): # dataType must be StructType names = [f.name for f in dataType.fields] converters = [_create_converter(f.dataType) for f in dataType.fields] + convert_fields = any(_need_converter(f.dataType) for f in dataType.fields) def convert_struct(obj): if obj is None: return if isinstance(obj, (tuple, list)): - return tuple(conv(v) for v, conv in zip(obj, converters)) + if convert_fields: + return tuple(conv(v) for v, conv in zip(obj, converters)) + else: + return tuple(obj) if isinstance(obj, dict): d = obj @@ -821,7 +846,10 @@ def convert_struct(obj): else: raise ValueError("Unexpected obj: %s" % obj) - return tuple([conv(d.get(name)) for name, conv in zip(names, converters)]) + if convert_fields: + return tuple([conv(d.get(name)) for name, conv in zip(names, converters)]) + else: + return tuple([d.get(name) for name in names]) return convert_struct @@ -871,20 +899,20 @@ def _parse_field_abstract(s): Parse a field in schema abstract >>> _parse_field_abstract("a") - StructField(a,None,true) + StructField(a,NullType,true) >>> _parse_field_abstract("b(c d)") - StructField(b,StructType(...c,None,true),StructField(d... + StructField(b,StructType(...c,NullType,true),StructField(d... >>> _parse_field_abstract("a[]") - StructField(a,ArrayType(None,true),true) + StructField(a,ArrayType(NullType,true),true) >>> _parse_field_abstract("a{[]}") - StructField(a,MapType(None,ArrayType(None,true),true),true) + StructField(a,MapType(NullType,ArrayType(NullType,true),true),true) """ if set(_BRACKETS.keys()) & set(s): idx = min((s.index(c) for c in _BRACKETS if c in s)) name = s[:idx] return StructField(name, _parse_schema_abstract(s[idx:]), True) else: - return StructField(s, None, True) + return StructField(s, NullType(), True) def _parse_schema_abstract(s): @@ -898,11 +926,11 @@ def _parse_schema_abstract(s): >>> _parse_schema_abstract("c{} d{a b}") StructType...c,MapType...d,MapType...a...b... 
>>> _parse_schema_abstract("a b(t)").fields[1] - StructField(b,StructType(List(StructField(t,None,true))),true) + StructField(b,StructType(List(StructField(t,NullType,true))),true) """ s = s.strip() if not s: - return + return NullType() elif s.startswith('('): return _parse_schema_abstract(s[1:-1]) @@ -911,7 +939,7 @@ def _parse_schema_abstract(s): return ArrayType(_parse_schema_abstract(s[1:-1]), True) elif s.startswith('{'): - return MapType(None, _parse_schema_abstract(s[1:-1])) + return MapType(NullType(), _parse_schema_abstract(s[1:-1])) parts = _split_schema_abstract(s) fields = [_parse_field_abstract(p) for p in parts] @@ -931,7 +959,7 @@ def _infer_schema_type(obj, dataType): >>> _infer_schema_type(row, schema) StructType...a,ArrayType...b,MapType(StringType,...c,LongType... """ - if dataType is None: + if dataType is NullType(): return _infer_type(obj) if not obj: @@ -1037,8 +1065,7 @@ def _verify_type(obj, dataType): for v, f in zip(obj, dataType.fields): _verify_type(v, f.dataType) - -_cached_cls = {} +_cached_cls = weakref.WeakValueDictionary() def _restore_object(dataType, obj): @@ -1233,8 +1260,7 @@ def __new__(self, *args, **kwargs): elif kwargs: # create row objects names = sorted(kwargs.keys()) - values = tuple(kwargs[n] for n in names) - row = tuple.__new__(self, values) + row = tuple.__new__(self, [kwargs[n] for n in names]) row.__FIELDS__ = names return row From dba08d1fc3bdb9245aefe695970354df088a93b6 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 27 Feb 2015 22:44:11 -0800 Subject: [PATCH 346/357] [SPARK-6070] [yarn] Remove unneeded classes from shuffle service jar. These may conflict with the classes already in the NM. We shouldn't be repackaging them. Author: Marcelo Vanzin Closes #4820 from vanzin/SPARK-6070 and squashes the following commits: 871b566 [Marcelo Vanzin] The "d'oh how didn't I think of it before" solution. 3cba946 [Marcelo Vanzin] Use profile instead, so that dependencies don't need to be explicitly listed. 7a18a1b [Marcelo Vanzin] [SPARK-6070] [yarn] Remove unneeded classes from shuffle service jar. --- network/yarn/pom.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/network/yarn/pom.xml b/network/yarn/pom.xml index acec8f18f2b5c..39b99f54f6dbc 100644 --- a/network/yarn/pom.xml +++ b/network/yarn/pom.xml @@ -33,6 +33,8 @@ http://spark.apache.org/ network-yarn + + provided @@ -47,7 +49,6 @@ org.apache.hadoop hadoop-client - provided From 6d8e5fbc0d83411174ffa59ff6a761a862eca32c Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Fri, 27 Feb 2015 22:59:35 -0800 Subject: [PATCH 347/357] [SPARK-5979][SPARK-6032] Smaller safer --packages fix pwendell tdas This is the safer parts of PR #4754: - SPARK-5979: All dependencies with the groupId `org.apache.spark` passed through `--packages`, were being excluded from the dependency tree on the assumption that they would be in the assembly jar. This is not the case, therefore the exclusion rules had to be defined more explicitly. - SPARK-6032: Ivy prints a whole lot of logs while retrieving dependencies. These were printed to `System.out`. Moved the logging to `System.err`. 
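A minimal, self-contained sketch of the stream swap behind the SPARK-6032 half; the `withStdOut` helper name and the try/finally restore are illustrative, while the patch itself swaps the stream inline around the Ivy calls:

    // Point java.lang.System.out at `target` while `body` runs, then restore the original stream.
    def withStdOut[T](target: java.io.PrintStream)(body: => T): T = {
      val saved = System.out
      System.setOut(target)
      try body finally System.setOut(saved)
    }

    // Usage: keep a chatty library's console output off stdout for the duration of one call.
    val resolved = withStdOut(System.err) {
      System.out.println("stands in for Ivy's resolve/retrieve logging")  // now lands on stderr
      "comma,separated,jar,paths"                                         // stands in for the resolved classpath
    }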
Author: Burak Yavuz Closes #4802 from brkyvz/simple-streaming-fix and squashes the following commits: e0f38cb [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into simple-streaming-fix bad921c [Burak Yavuz] [SPARK-5979][SPARK-6032] Smaller safer fix --- .../org/apache/spark/deploy/SparkSubmit.scala | 53 +++++++++++++------ .../spark/deploy/SparkSubmitUtilsSuite.scala | 16 +++++- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 4c4110812e0a1..4a74641f4e1fa 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -655,8 +655,7 @@ private[spark] object SparkSubmitUtils { /** * Extracts maven coordinates from a comma-delimited string. Coordinates should be provided - * in the format `groupId:artifactId:version` or `groupId/artifactId:version`. The latter provides - * simplicity for Spark Package users. + * in the format `groupId:artifactId:version` or `groupId/artifactId:version`. * @param coordinates Comma-delimited string of maven coordinates * @return Sequence of Maven coordinates */ @@ -747,6 +746,35 @@ private[spark] object SparkSubmitUtils { md.addDependency(dd) } } + + /** Add exclusion rules for dependencies already included in the spark-assembly */ + private[spark] def addExclusionRules( + ivySettings: IvySettings, + ivyConfName: String, + md: DefaultModuleDescriptor): Unit = { + // Add scala exclusion rule + val scalaArtifacts = new ArtifactId(new ModuleId("*", "scala-library"), "*", "*", "*") + val scalaDependencyExcludeRule = + new DefaultExcludeRule(scalaArtifacts, ivySettings.getMatcher("glob"), null) + scalaDependencyExcludeRule.addConfiguration(ivyConfName) + md.addExcludeRule(scalaDependencyExcludeRule) + + // We need to specify each component explicitly, otherwise we miss spark-streaming-kafka and + // other spark-streaming utility components. Underscore is there to differentiate between + // spark-streaming_2.1x and spark-streaming-kafka-assembly_2.1x + val components = Seq("bagel_", "catalyst_", "core_", "graphx_", "hive_", "mllib_", "repl_", + "sql_", "streaming_", "yarn_", "network-common_", "network-shuffle_", "network-yarn_") + + components.foreach { comp => + val sparkArtifacts = + new ArtifactId(new ModuleId("org.apache.spark", s"spark-$comp*"), "*", "*", "*") + val sparkDependencyExcludeRule = + new DefaultExcludeRule(sparkArtifacts, ivySettings.getMatcher("glob"), null) + sparkDependencyExcludeRule.addConfiguration(ivyConfName) + + md.addExcludeRule(sparkDependencyExcludeRule) + } + } /** A nice function to use in tests as well. Values are dummy strings. 
*/ private[spark] def getModuleDescriptor = DefaultModuleDescriptor.newDefaultInstance( @@ -768,6 +796,9 @@ private[spark] object SparkSubmitUtils { if (coordinates == null || coordinates.trim.isEmpty) { "" } else { + val sysOut = System.out + // To prevent ivy from logging to system out + System.setOut(printStream) val artifacts = extractMavenCoordinates(coordinates) // Default configuration name for ivy val ivyConfName = "default" @@ -811,19 +842,9 @@ private[spark] object SparkSubmitUtils { val md = getModuleDescriptor md.setDefaultConf(ivyConfName) - // Add an exclusion rule for Spark and Scala Library - val sparkArtifacts = new ArtifactId(new ModuleId("org.apache.spark", "*"), "*", "*", "*") - val sparkDependencyExcludeRule = - new DefaultExcludeRule(sparkArtifacts, ivySettings.getMatcher("glob"), null) - sparkDependencyExcludeRule.addConfiguration(ivyConfName) - val scalaArtifacts = new ArtifactId(new ModuleId("*", "scala-library"), "*", "*", "*") - val scalaDependencyExcludeRule = - new DefaultExcludeRule(scalaArtifacts, ivySettings.getMatcher("glob"), null) - scalaDependencyExcludeRule.addConfiguration(ivyConfName) - - // Exclude any Spark dependencies, and add all supplied maven artifacts as dependencies - md.addExcludeRule(sparkDependencyExcludeRule) - md.addExcludeRule(scalaDependencyExcludeRule) + // Add exclusion rules for Spark and Scala Library + addExclusionRules(ivySettings, ivyConfName, md) + // add all supplied maven artifacts as dependencies addDependenciesToIvy(md, artifacts, ivyConfName) // resolve dependencies @@ -835,7 +856,7 @@ private[spark] object SparkSubmitUtils { ivy.retrieve(rr.getModuleDescriptor.getModuleRevisionId, packagesDirectory.getAbsolutePath + File.separator + "[artifact](-[classifier]).[ext]", retrieveOptions.setConfs(Array(ivyConfName))) - + System.setOut(sysOut) resolveDependencyPaths(rr.getArtifacts.toArray, packagesDirectory) } } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala index ad62b35f624f6..8bcca926097a1 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala @@ -117,8 +117,20 @@ class SparkSubmitUtilsSuite extends FunSuite with BeforeAndAfterAll { } test("neglects Spark and Spark's dependencies") { - val path = SparkSubmitUtils.resolveMavenCoordinates( - "org.apache.spark:spark-core_2.10:1.2.0", None, None, true) + val components = Seq("bagel_", "catalyst_", "core_", "graphx_", "hive_", "mllib_", "repl_", + "sql_", "streaming_", "yarn_", "network-common_", "network-shuffle_", "network-yarn_") + + val coordinates = + components.map(comp => s"org.apache.spark:spark-${comp}2.10:1.2.0").mkString(",") + + ",org.apache.spark:spark-core_fake:1.2.0" + + val path = SparkSubmitUtils.resolveMavenCoordinates(coordinates, None, None, true) assert(path === "", "should return empty path") + // Should not exclude the following dependency. Will throw an error, because it doesn't exist, + // but the fact that it is checking means that it wasn't excluded. + intercept[RuntimeException] { + SparkSubmitUtils.resolveMavenCoordinates(coordinates + + ",org.apache.spark:spark-streaming-kafka-assembly_2.10:1.2.0", None, None, true) + } } } From 9168259813713a12251fb0d457ffbbed8ba857f8 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Fri, 27 Feb 2015 23:10:09 -0800 Subject: [PATCH 348/357] MAINTENANCE: Automated closing of pull requests. 
This commit exists to close the following pull requests on Github: Closes #1128 (close requested by 'srowen') Closes #3425 (close requested by 'srowen') Closes #4770 (close requested by 'srowen') Closes #2813 (close requested by 'srowen') From e6003f0a571ba44fcd011e695c8622e11cfee7dd Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sat, 28 Feb 2015 21:15:43 +0800 Subject: [PATCH 349/357] [SPARK-5775] [SQL] BugFix: GenericRow cannot be cast to SpecificMutableRow when nested data and partitioned table This PR adapts anselmevignon's #4697 to master and branch-1.3. Please refer to PR description of #4697 for details. [Review on Reviewable](https://reviewable.io/reviews/apache/spark/4792) Author: Cheng Lian Author: Cheng Lian Author: Yin Huai Closes #4792 from liancheng/spark-5775 and squashes the following commits: 538f506 [Cheng Lian] Addresses comments cee55cf [Cheng Lian] Merge pull request #4 from yhuai/spark-5775-yin b0b74fb [Yin Huai] Remove runtime pattern matching. ca6e038 [Cheng Lian] Fixes SPARK-5775 --- .../sql/parquet/ParquetTableOperations.scala | 59 ++++++-- .../apache/spark/sql/parquet/newParquet.scala | 48 +++++-- .../spark/sql/parquet/parquetSuites.scala | 134 +++++++++++++++++- 3 files changed, 217 insertions(+), 24 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala index 4e4f647767dc9..225ec6db7d553 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala @@ -126,6 +126,13 @@ private[sql] case class ParquetTableScan( conf) if (requestedPartitionOrdinals.nonEmpty) { + // This check is based on CatalystConverter.createRootConverter. + val primitiveRow = output.forall(a => ParquetTypesConverter.isPrimitiveType(a.dataType)) + + // Uses temporary variable to avoid the whole `ParquetTableScan` object being captured into + // the `mapPartitionsWithInputSplit` closure below. + val outputSize = output.size + baseRDD.mapPartitionsWithInputSplit { case (split, iter) => val partValue = "([^=]+)=([^=]+)".r val partValues = @@ -143,19 +150,47 @@ private[sql] case class ParquetTableScan( relation.partitioningAttributes .map(a => Cast(Literal(partValues(a.name)), a.dataType).eval(EmptyRow)) - new Iterator[Row] { - def hasNext = iter.hasNext - def next() = { - val row = iter.next()._2.asInstanceOf[SpecificMutableRow] - - // Parquet will leave partitioning columns empty, so we fill them in here. - var i = 0 - while (i < requestedPartitionOrdinals.size) { - row(requestedPartitionOrdinals(i)._2) = - partitionRowValues(requestedPartitionOrdinals(i)._1) - i += 1 + if (primitiveRow) { + new Iterator[Row] { + def hasNext = iter.hasNext + def next() = { + // We are using CatalystPrimitiveRowConverter and it returns a SpecificMutableRow. + val row = iter.next()._2.asInstanceOf[SpecificMutableRow] + + // Parquet will leave partitioning columns empty, so we fill them in here. + var i = 0 + while (i < requestedPartitionOrdinals.size) { + row(requestedPartitionOrdinals(i)._2) = + partitionRowValues(requestedPartitionOrdinals(i)._1) + i += 1 + } + row + } + } + } else { + // Create a mutable row since we need to fill in values from partition columns. + val mutableRow = new GenericMutableRow(outputSize) + new Iterator[Row] { + def hasNext = iter.hasNext + def next() = { + // We are using CatalystGroupConverter and it returns a GenericRow. 
+ // Since GenericRow is not mutable, we just cast it to a Row. + val row = iter.next()._2.asInstanceOf[Row] + + var i = 0 + while (i < row.size) { + mutableRow(i) = row(i) + i += 1 + } + // Parquet will leave partitioning columns empty, so we fill them in here. + i = 0 + while (i < requestedPartitionOrdinals.size) { + mutableRow(requestedPartitionOrdinals(i)._2) = + partitionRowValues(requestedPartitionOrdinals(i)._1) + i += 1 + } + mutableRow } - row } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index e648618468d5d..6d56be3ab8dd4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -482,6 +482,10 @@ private[sql] case class ParquetRelation2( // When the data does not include the key and the key is requested then we must fill it in // based on information from the input split. if (!partitionKeysIncludedInDataSchema && partitionKeyLocations.nonEmpty) { + // This check is based on CatalystConverter.createRootConverter. + val primitiveRow = + requestedSchema.forall(a => ParquetTypesConverter.isPrimitiveType(a.dataType)) + baseRDD.mapPartitionsWithInputSplit { case (split: ParquetInputSplit, iterator) => val partValues = selectedPartitions.collectFirst { case p if split.getPath.getParent.toString == p.path => p.values @@ -489,16 +493,42 @@ private[sql] case class ParquetRelation2( val requiredPartOrdinal = partitionKeyLocations.keys.toSeq - iterator.map { pair => - val row = pair._2.asInstanceOf[SpecificMutableRow] - var i = 0 - while (i < requiredPartOrdinal.size) { - // TODO Avoids boxing cost here! - val partOrdinal = requiredPartOrdinal(i) - row.update(partitionKeyLocations(partOrdinal), partValues(partOrdinal)) - i += 1 + if (primitiveRow) { + iterator.map { pair => + // We are using CatalystPrimitiveRowConverter and it returns a SpecificMutableRow. + val row = pair._2.asInstanceOf[SpecificMutableRow] + var i = 0 + while (i < requiredPartOrdinal.size) { + // TODO Avoids boxing cost here! + val partOrdinal = requiredPartOrdinal(i) + row.update(partitionKeyLocations(partOrdinal), partValues(partOrdinal)) + i += 1 + } + row + } + } else { + // Create a mutable row since we need to fill in values from partition columns. + val mutableRow = new GenericMutableRow(requestedSchema.size) + iterator.map { pair => + // We are using CatalystGroupConverter and it returns a GenericRow. + // Since GenericRow is not mutable, we just cast it to a Row. + val row = pair._2.asInstanceOf[Row] + var i = 0 + while (i < row.size) { + // TODO Avoids boxing cost here! + mutableRow(i) = row(i) + i += 1 + } + + i = 0 + while (i < requiredPartOrdinal.size) { + // TODO Avoids boxing cost here! 
+ val partOrdinal = requiredPartOrdinal(i) + mutableRow.update(partitionKeyLocations(partOrdinal), partValues(partOrdinal)) + i += 1 + } + mutableRow } - row } } } else { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index 6a9d9daf6750c..c8da8eea4e646 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -36,6 +36,20 @@ case class ParquetData(intField: Int, stringField: String) // The data that also includes the partitioning key case class ParquetDataWithKey(p: Int, intField: Int, stringField: String) +case class StructContainer(intStructField :Int, stringStructField: String) + +case class ParquetDataWithComplexTypes( + intField: Int, + stringField: String, + structField: StructContainer, + arrayField: Seq[Int]) + +case class ParquetDataWithKeyAndComplexTypes( + p: Int, + intField: Int, + stringField: String, + structField: StructContainer, + arrayField: Seq[Int]) /** * A suite to test the automatic conversion of metastore tables with parquet data to use the @@ -86,6 +100,38 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest { location '${new File(normalTableDir, "normal").getCanonicalPath}' """) + sql(s""" + CREATE EXTERNAL TABLE partitioned_parquet_with_complextypes + ( + intField INT, + stringField STRING, + structField STRUCT, + arrayField ARRAY + ) + PARTITIONED BY (p int) + ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + STORED AS + INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + LOCATION '${partitionedTableDirWithComplexTypes.getCanonicalPath}' + """) + + sql(s""" + CREATE EXTERNAL TABLE partitioned_parquet_with_key_and_complextypes + ( + intField INT, + stringField STRING, + structField STRUCT, + arrayField ARRAY + ) + PARTITIONED BY (p int) + ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + STORED AS + INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + LOCATION '${partitionedTableDirWithKeyAndComplexTypes.getCanonicalPath}' + """) + (1 to 10).foreach { p => sql(s"ALTER TABLE partitioned_parquet ADD PARTITION (p=$p)") } @@ -94,7 +140,15 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest { sql(s"ALTER TABLE partitioned_parquet_with_key ADD PARTITION (p=$p)") } - val rdd1 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str${i}"}""")) + (1 to 10).foreach { p => + sql(s"ALTER TABLE partitioned_parquet_with_key_and_complextypes ADD PARTITION (p=$p)") + } + + (1 to 10).foreach { p => + sql(s"ALTER TABLE partitioned_parquet_with_complextypes ADD PARTITION (p=$p)") + } + + val rdd1 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":$i, "b":"str$i"}""")) jsonRDD(rdd1).registerTempTable("jt") val rdd2 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":[$i, null]}""")) jsonRDD(rdd2).registerTempTable("jt_array") @@ -105,6 +159,8 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest { override def afterAll(): Unit = { sql("DROP TABLE partitioned_parquet") sql("DROP TABLE partitioned_parquet_with_key") + sql("DROP TABLE partitioned_parquet_with_complextypes") + sql("DROP TABLE partitioned_parquet_with_key_and_complextypes") 
sql("DROP TABLE normal_parquet") sql("DROP TABLE IF EXISTS jt") sql("DROP TABLE IF EXISTS jt_array") @@ -409,6 +465,22 @@ class ParquetSourceSuiteBase extends ParquetPartitioningTest { path '${new File(partitionedTableDir, "p=1").getCanonicalPath}' ) """) + + sql( s""" + CREATE TEMPORARY TABLE partitioned_parquet_with_key_and_complextypes + USING org.apache.spark.sql.parquet + OPTIONS ( + path '${partitionedTableDirWithKeyAndComplexTypes.getCanonicalPath}' + ) + """) + + sql( s""" + CREATE TEMPORARY TABLE partitioned_parquet_with_complextypes + USING org.apache.spark.sql.parquet + OPTIONS ( + path '${partitionedTableDirWithComplexTypes.getCanonicalPath}' + ) + """) } test("SPARK-6016 make sure to use the latest footers") { @@ -473,7 +545,8 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll var partitionedTableDir: File = null var normalTableDir: File = null var partitionedTableDirWithKey: File = null - + var partitionedTableDirWithComplexTypes: File = null + var partitionedTableDirWithKeyAndComplexTypes: File = null override def beforeAll(): Unit = { partitionedTableDir = File.createTempFile("parquettests", "sparksql") @@ -509,9 +582,45 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll .toDF() .saveAsParquetFile(partDir.getCanonicalPath) } + + partitionedTableDirWithKeyAndComplexTypes = File.createTempFile("parquettests", "sparksql") + partitionedTableDirWithKeyAndComplexTypes.delete() + partitionedTableDirWithKeyAndComplexTypes.mkdir() + + (1 to 10).foreach { p => + val partDir = new File(partitionedTableDirWithKeyAndComplexTypes, s"p=$p") + sparkContext.makeRDD(1 to 10).map { i => + ParquetDataWithKeyAndComplexTypes( + p, i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i) + }.toDF().saveAsParquetFile(partDir.getCanonicalPath) + } + + partitionedTableDirWithComplexTypes = File.createTempFile("parquettests", "sparksql") + partitionedTableDirWithComplexTypes.delete() + partitionedTableDirWithComplexTypes.mkdir() + + (1 to 10).foreach { p => + val partDir = new File(partitionedTableDirWithComplexTypes, s"p=$p") + sparkContext.makeRDD(1 to 10).map { i => + ParquetDataWithComplexTypes(i, s"part-$p", StructContainer(i, f"${i}_string"), 1 to i) + }.toDF().saveAsParquetFile(partDir.getCanonicalPath) + } + } + + override protected def afterAll(): Unit = { + partitionedTableDir.delete() + normalTableDir.delete() + partitionedTableDirWithKey.delete() + partitionedTableDirWithComplexTypes.delete() + partitionedTableDirWithKeyAndComplexTypes.delete() } - Seq("partitioned_parquet", "partitioned_parquet_with_key").foreach { table => + Seq( + "partitioned_parquet", + "partitioned_parquet_with_key", + "partitioned_parquet_with_complextypes", + "partitioned_parquet_with_key_and_complextypes").foreach { table => + test(s"ordering of the partitioning columns $table") { checkAnswer( sql(s"SELECT p, stringField FROM $table WHERE p = 1"), @@ -601,6 +710,25 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll } } + Seq( + "partitioned_parquet_with_key_and_complextypes", + "partitioned_parquet_with_complextypes").foreach { table => + + test(s"SPARK-5775 read struct from $table") { + checkAnswer( + sql(s"SELECT p, structField.intStructField, structField.stringStructField FROM $table WHERE p = 1"), + (1 to 10).map(i => Row(1, i, f"${i}_string"))) + } + + // Re-enable this after SPARK-5508 is fixed + ignore(s"SPARK-5775 read array from $table") { + checkAnswer( + sql(s"SELECT arrayField, p FROM $table WHERE p = 
1"), + (1 to 10).map(i => Row(1 to i, 1))) + } + } + + test("non-part select(*)") { checkAnswer( sql("SELECT COUNT(*) FROM normal_parquet"), From b36b1bc22ea73669b0f69ed21e77d47fb0a7cd5d Mon Sep 17 00:00:00 2001 From: Michael Griffiths Date: Sat, 28 Feb 2015 14:47:39 +0000 Subject: [PATCH 350/357] SPARK-6063 MLlib doesn't pass mvn scalastyle check due to UTF chars in LDAModel.scala Remove unicode characters from MLlib file. Author: Michael Griffiths Author: Griffiths, Michael (NYC-RPM) Closes #4815 from msjgriffiths/SPARK-6063 and squashes the following commits: bcd7de1 [Griffiths, Michael (NYC-RPM)] Change \u201D quote marks around 'theta' to standard single apostrophe (\x27) 38eb535 [Michael Griffiths] Merge pull request #2 from apache/master b08e865 [Michael Griffiths] Merge pull request #1 from apache/master --- .../main/scala/org/apache/spark/mllib/clustering/LDAModel.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index b0e991d2f2344..0a3f21ecee0dc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -130,7 +130,7 @@ abstract class LDAModel private[clustering] { /* TODO * Compute the estimated topic distribution for each document. - * This is often called “theta” in the literature. + * This is often called 'theta' in the literature. * * @param documents RDD of documents, which are term (word) count vectors paired with IDs. * The term count vectors are "bags of words" with a fixed-size vocabulary From f91298e2c597e45af461931919372da5d33ae3da Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 28 Feb 2015 15:23:59 +0000 Subject: [PATCH 351/357] SPARK-5983 [WEBUI] Don't respond to HTTP TRACE in HTTP-based UIs Disallow TRACE HTTP method in servlets Author: Sean Owen Closes #4765 from srowen/SPARK-5983 and squashes the following commits: 421b25b [Sean Owen] Disallow TRACE HTTP method in servlets --- .../org/apache/spark/deploy/history/HistoryServer.scala | 4 ++++ core/src/main/scala/org/apache/spark/ui/JettyUtils.scala | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index fa9bfe5426b6c..af483d560b33e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -96,6 +96,10 @@ class HistoryServer( } } } + // SPARK-5983 ensure TRACE is not supported + protected override def doTrace(req: HttpServletRequest, res: HttpServletResponse): Unit = { + res.sendError(HttpServletResponse.SC_METHOD_NOT_ALLOWED) + } } initialize() diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index bf4b24e98b134..95f254a9ef22a 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -80,6 +80,10 @@ private[spark] object JettyUtils extends Logging { response.sendError(HttpServletResponse.SC_BAD_REQUEST, e.getMessage) } } + // SPARK-5983 ensure TRACE is not supported + protected override def doTrace(req: HttpServletRequest, res: HttpServletResponse): Unit = { + res.sendError(HttpServletResponse.SC_METHOD_NOT_ALLOWED) + } } } @@ -119,6 
+123,10 @@ private[spark] object JettyUtils extends Logging { val newUrl = new URL(new URL(request.getRequestURL.toString), prefixedDestPath).toString response.sendRedirect(newUrl) } + // SPARK-5983 ensure TRACE is not supported + protected override def doTrace(req: HttpServletRequest, res: HttpServletResponse): Unit = { + res.sendError(HttpServletResponse.SC_METHOD_NOT_ALLOWED) + } } createServletHandler(srcPath, servlet, basePath) } From 86fcdaef62dbe624233e364ffe43fe3a1da893f0 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 28 Feb 2015 15:34:08 +0000 Subject: [PATCH 352/357] SPARK-1965 [WEBUI] Spark UI throws NPE on trying to load the app page for non-existent app Don't throw NPE if appId is unknown. kayousterhout is this a decent enough band-aid for avoiding a full-blown NPE? it should just render empty content instead Author: Sean Owen Closes #4777 from srowen/SPARK-1965 and squashes the following commits: 7e16590 [Sean Owen] Update app not found message cb878d6 [Sean Owen] Return basic "not found" page for unknown appId d8270da [Sean Owen] Don't throw NPE if appId is unknown --- .../spark/deploy/master/ui/ApplicationPage.scala | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala index 3aae2b95d7396..76fc40e17d9a8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala @@ -24,6 +24,7 @@ import scala.xml.Node import akka.pattern.ask import org.json4s.JValue +import org.json4s.JsonAST.JNothing import org.apache.spark.deploy.{ExecutorState, JsonProtocol} import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState} @@ -44,7 +45,11 @@ private[spark] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app val app = state.activeApps.find(_.id == appId).getOrElse({ state.completedApps.find(_.id == appId).getOrElse(null) }) - JsonProtocol.writeApplicationInfo(app) + if (app == null) { + JNothing + } else { + JsonProtocol.writeApplicationInfo(app) + } } /** Executor details for a particular application */ @@ -55,6 +60,10 @@ private[spark] class ApplicationPage(parent: MasterWebUI) extends WebUIPage("app val app = state.activeApps.find(_.id == appId).getOrElse({ state.completedApps.find(_.id == appId).getOrElse(null) }) + if (app == null) { + val msg =
<div>No running application with ID {appId}</div>
    + return UIUtils.basicSparkPage(msg, "Not Found") + } val executorHeaders = Seq("ExecutorID", "Worker", "Cores", "Memory", "State", "Logs") val allExecutors = (app.executors.values ++ app.removedExecutors).toSet.toSeq From 643300a6e27dac3822f9a3ced0ad5fb3b4f2ad75 Mon Sep 17 00:00:00 2001 From: Evan Yu Date: Sat, 28 Feb 2015 18:55:34 -0800 Subject: [PATCH 353/357] SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException Fix TimSort bug which causes a ArrayOutOfBoundsException. Using the proposed fix here http://envisage-project.eu/proving-android-java-and-python-sorting-algorithm-is-broken-and-how-to-fix-it/ Author: Evan Yu Closes #4804 from hotou/SPARK-5984 and squashes the following commits: 3421b6c [Evan Yu] SPARK-5984: Add info to LICENSE e61c6b8 [Evan Yu] SPARK-5984: Fix license and document 6ccc280 [Evan Yu] SPARK-5984: Add License header to file e06c0d2 [Evan Yu] SPARK-5984: Add License header to file 4d95f75 [Evan Yu] SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException 479a106 [Evan Yu] SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException --- LICENSE | 16 +++ .../apache/spark/util/collection/TimSort.java | 9 +- .../spark/util/collection/TestTimSort.java | 134 ++++++++++++++++++ .../spark/util/collection/SorterSuite.scala | 7 + 4 files changed, 161 insertions(+), 5 deletions(-) create mode 100644 core/src/test/java/org/apache/spark/util/collection/TestTimSort.java diff --git a/LICENSE b/LICENSE index 0a42d389e4c3c..9b364a4d00079 100644 --- a/LICENSE +++ b/LICENSE @@ -771,6 +771,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. +======================================================================== +For TestTimSort (core/src/test/java/org/apache/spark/util/collection/TestTimSort.java): +======================================================================== +Copyright (C) 2015 Stijn de Gouw + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
======================================================================== For LimitedInputStream diff --git a/core/src/main/java/org/apache/spark/util/collection/TimSort.java b/core/src/main/java/org/apache/spark/util/collection/TimSort.java index 409e1a41c5d49..a90cc0e761f62 100644 --- a/core/src/main/java/org/apache/spark/util/collection/TimSort.java +++ b/core/src/main/java/org/apache/spark/util/collection/TimSort.java @@ -425,15 +425,14 @@ private void pushRun(int runBase, int runLen) { private void mergeCollapse() { while (stackSize > 1) { int n = stackSize - 2; - if (n > 0 && runLen[n-1] <= runLen[n] + runLen[n+1]) { + if ( (n >= 1 && runLen[n-1] <= runLen[n] + runLen[n+1]) + || (n >= 2 && runLen[n-2] <= runLen[n] + runLen[n-1])) { if (runLen[n - 1] < runLen[n + 1]) n--; - mergeAt(n); - } else if (runLen[n] <= runLen[n + 1]) { - mergeAt(n); - } else { + } else if (runLen[n] > runLen[n + 1]) { break; // Invariant is established } + mergeAt(n); } } diff --git a/core/src/test/java/org/apache/spark/util/collection/TestTimSort.java b/core/src/test/java/org/apache/spark/util/collection/TestTimSort.java new file mode 100644 index 0000000000000..45772b6d3c20d --- /dev/null +++ b/core/src/test/java/org/apache/spark/util/collection/TestTimSort.java @@ -0,0 +1,134 @@ +/** + * Copyright 2015 Stijn de Gouw + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.apache.spark.util.collection; + +import java.util.*; + +/** + * This codes generates a int array which fails the standard TimSort. + * + * The blog that reported the bug + * http://www.envisage-project.eu/timsort-specification-and-verification/ + * + * This codes was originally wrote by Stijn de Gouw, modified by Evan Yu to adapt to + * our test suite. + * + * https://github.com/abstools/java-timsort-bug + * https://github.com/abstools/java-timsort-bug/blob/master/LICENSE + */ +public class TestTimSort { + + private static final int MIN_MERGE = 32; + + /** + * Returns an array of integers that demonstrate the bug in TimSort + */ + public static int[] getTimSortBugTestSet(int length) { + int minRun = minRunLength(length); + List runs = runsJDKWorstCase(minRun, length); + return createArray(runs, length); + } + + private static int minRunLength(int n) { + int r = 0; // Becomes 1 if any 1 bits are shifted off + while (n >= MIN_MERGE) { + r |= (n & 1); + n >>= 1; + } + return n + r; + } + + private static int[] createArray(List runs, int length) { + int[] a = new int[length]; + Arrays.fill(a, 0); + int endRun = -1; + for (long len : runs) { + a[endRun += len] = 1; + } + a[length - 1] = 0; + return a; + } + + /** + * Fills runs with a sequence of run lengths of the form
+ * Y_n x_{n,1} x_{n,2} ... x_{n,l_n}
+ * Y_{n-1} x_{n-1,1} x_{n-1,2} ... x_{n-1,l_{n-1}}
+ * ...
+ * Y_1 x_{1,1} x_{1,2} ... x_{1,l_1}
    + * The Y_i's are chosen to satisfy the invariant throughout execution, + * but the x_{i,j}'s are merged (by TimSort.mergeCollapse) + * into an X_i that violates the invariant. + * + * @param length The sum of all run lengths that will be added to runs. + */ + private static List runsJDKWorstCase(int minRun, int length) { + List runs = new ArrayList(); + + long runningTotal = 0, Y = minRun + 4, X = minRun; + + while (runningTotal + Y + X <= length) { + runningTotal += X + Y; + generateJDKWrongElem(runs, minRun, X); + runs.add(0, Y); + // X_{i+1} = Y_i + x_{i,1} + 1, since runs.get(1) = x_{i,1} + X = Y + runs.get(1) + 1; + // Y_{i+1} = X_{i+1} + Y_i + 1 + Y += X + 1; + } + + if (runningTotal + X <= length) { + runningTotal += X; + generateJDKWrongElem(runs, minRun, X); + } + + runs.add(length - runningTotal); + return runs; + } + + /** + * Adds a sequence x_1, ..., x_n of run lengths to runs such that:
+ * 1. X = x_1 + ... + x_n
+ * 2. x_j >= minRun for all j
+ * 3. x_1 + ... + x_{j-2} < x_j < x_1 + ... + x_{j-1} for all j
    + * These conditions guarantee that TimSort merges all x_j's one by one + * (resulting in X) using only merges on the second-to-last element. + * + * @param X The sum of the sequence that should be added to runs. + */ + private static void generateJDKWrongElem(List runs, int minRun, long X) { + for (long newTotal; X >= 2 * minRun + 1; X = newTotal) { + //Default strategy + newTotal = X / 2 + 1; + //Specialized strategies + if (3 * minRun + 3 <= X && X <= 4 * minRun + 1) { + // add x_1=MIN+1, x_2=MIN, x_3=X-newTotal to runs + newTotal = 2 * minRun + 1; + } else if (5 * minRun + 5 <= X && X <= 6 * minRun + 5) { + // add x_1=MIN+1, x_2=MIN, x_3=MIN+2, x_4=X-newTotal to runs + newTotal = 3 * minRun + 3; + } else if (8 * minRun + 9 <= X && X <= 10 * minRun + 9) { + // add x_1=MIN+1, x_2=MIN, x_3=MIN+2, x_4=2MIN+2, x_5=X-newTotal to runs + newTotal = 5 * minRun + 5; + } else if (13 * minRun + 15 <= X && X <= 16 * minRun + 17) { + // add x_1=MIN+1, x_2=MIN, x_3=MIN+2, x_4=2MIN+2, x_5=3MIN+4, x_6=X-newTotal to runs + newTotal = 8 * minRun + 9; + } + runs.add(0, X - newTotal); + } + runs.add(0, X); + } +} diff --git a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala index 0cb1ed7397655..e0d6cc16bde05 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala @@ -65,6 +65,13 @@ class SorterSuite extends FunSuite { } } + // http://www.envisage-project.eu/timsort-specification-and-verification/ + test("SPARK-5984 TimSort bug") { + val data = TestTimSort.getTimSortBugTestSet(67108864) + new Sorter(new IntArraySortDataFormat).sort(data, 0, data.length, Ordering.Int) + (0 to data.length - 2).foreach(i => assert(data(i) <= data(i + 1))) + } + /** Runs an experiment several times. */ def runExperiment(name: String, skip: Boolean = false)(f: => Unit, prepare: () => Unit): Unit = { if (skip) { From 2df5f1f00661cd31b9fc37e80345a3f5f856c95f Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sat, 28 Feb 2015 22:51:01 -0800 Subject: [PATCH 354/357] [SPARK-6075] Fix bug in that caused lost accumulator updates: do not store WeakReferences in localAccums map This fixes a non-deterministic bug introduced in #4021 that could cause tasks' accumulator updates to be lost. The problem is that `localAccums` should not hold weak references: after the task finishes running there won't be any strong references to these local accumulators, so they can get garbage-collected before the executor reads the `localAccums` map. We don't need weak references here anyways, since this map is cleared at the end of each task. 
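To see why weakly held values lose updates, here is a small standalone Scala sketch (plain stand-ins, not Spark classes) of a WeakReference-backed map entry vanishing once its last strong reference is dropped:

    import java.lang.ref.WeakReference
    import scala.collection.mutable

    // A task-local style map whose values are only weakly reachable.
    val localUpdates = mutable.Map[Long, WeakReference[java.lang.StringBuilder]]()

    var acc = new java.lang.StringBuilder("partial result")  // the only strong reference
    localUpdates(1L) = new WeakReference(acc)

    acc = null       // "task finished": no strong reference remains
    System.gc()      // only a hint, so the loss is nondeterministic, matching the reported flakiness

    // After a collection the referent can already be gone and the update is silently dropped.
    Option(localUpdates(1L).get()) match {
      case Some(v) => println(s"update still visible: $v")
      case None    => println("update lost before the executor-side read")
    }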
Author: Josh Rosen Closes #4835 from JoshRosen/SPARK-6075 and squashes the following commits: 4f4b5b2 [Josh Rosen] Remove defensive assertions that caused test failures in code unrelated to this change 120c7b0 [Josh Rosen] [SPARK-6075] Do not store WeakReferences in localAccums map --- .../scala/org/apache/spark/Accumulators.scala | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala index 30f0ccd73ccca..bcf832467f00b 100644 --- a/core/src/main/scala/org/apache/spark/Accumulators.scala +++ b/core/src/main/scala/org/apache/spark/Accumulators.scala @@ -280,15 +280,24 @@ object AccumulatorParam { // TODO: The multi-thread support in accumulators is kind of lame; check // if there's a more intuitive way of doing it right -private[spark] object Accumulators { - // Store a WeakReference instead of a StrongReference because this way accumulators can be - // appropriately garbage collected during long-running jobs and release memory - type WeakAcc = WeakReference[Accumulable[_, _]] - val originals = Map[Long, WeakAcc]() - val localAccums = new ThreadLocal[Map[Long, WeakAcc]]() { - override protected def initialValue() = Map[Long, WeakAcc]() +private[spark] object Accumulators extends Logging { + /** + * This global map holds the original accumulator objects that are created on the driver. + * It keeps weak references to these objects so that accumulators can be garbage-collected + * once the RDDs and user-code that reference them are cleaned up. + */ + val originals = Map[Long, WeakReference[Accumulable[_, _]]]() + + /** + * This thread-local map holds per-task copies of accumulators; it is used to collect the set + * of accumulator updates to send back to the driver when tasks complete. After tasks complete, + * this map is cleared by `Accumulators.clear()` (see Executor.scala). + */ + private val localAccums = new ThreadLocal[Map[Long, Accumulable[_, _]]]() { + override protected def initialValue() = Map[Long, Accumulable[_, _]]() } - var lastId: Long = 0 + + private var lastId: Long = 0 def newId(): Long = synchronized { lastId += 1 @@ -297,16 +306,16 @@ private[spark] object Accumulators { def register(a: Accumulable[_, _], original: Boolean): Unit = synchronized { if (original) { - originals(a.id) = new WeakAcc(a) + originals(a.id) = new WeakReference[Accumulable[_, _]](a) } else { - localAccums.get()(a.id) = new WeakAcc(a) + localAccums.get()(a.id) = a } } // Clear the local (non-original) accumulators for the current thread def clear() { synchronized { - localAccums.get.clear + localAccums.get.clear() } } @@ -320,12 +329,7 @@ private[spark] object Accumulators { def values: Map[Long, Any] = synchronized { val ret = Map[Long, Any]() for ((id, accum) <- localAccums.get) { - // Since we are now storing weak references, we must check whether the underlying data - // is valid. 
- ret(id) = accum.get match { - case Some(values) => values.localValue - case None => None - } + ret(id) = accum.localValue } return ret } @@ -341,6 +345,8 @@ private[spark] object Accumulators { case None => throw new IllegalAccessError("Attempted to access garbage collected Accumulator.") } + } else { + logWarning(s"Ignoring accumulator update for unknown accumulator id $id") } } } From fd8d283eeb98e310b1e85ef8c3a8af9e547ab5e0 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Sun, 1 Mar 2015 11:05:10 +0000 Subject: [PATCH 355/357] [SPARK-6074] [sql] Package pyspark sql bindings. This is needed for the SQL bindings to work on Yarn. Author: Marcelo Vanzin Closes #4822 from vanzin/SPARK-6074 and squashes the following commits: fb52001 [Marcelo Vanzin] [SPARK-6074] [sql] Package pyspark sql bindings. --- sql/core/pom.xml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 03a5c9e7c24a0..e28baa512b95c 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -109,5 +109,13 @@ target/scala-${scala.binary.version}/classes target/scala-${scala.binary.version}/test-classes + + + ../../python + + pyspark/sql/*.py + + + From aedbbaa3dda9cbc154cd52c07f6d296b972b0eb2 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sun, 1 Mar 2015 16:26:57 -0800 Subject: [PATCH 356/357] [SPARK-6053][MLLIB] support save/load in PySpark's ALS A simple wrapper to save/load `MatrixFactorizationModel` in Python. jkbradley Author: Xiangrui Meng Closes #4811 from mengxr/SPARK-5991 and squashes the following commits: f135dac [Xiangrui Meng] update save doc 57e5200 [Xiangrui Meng] address comments 06140a4 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-5991 282ec8d [Xiangrui Meng] support save/load in PySpark's ALS --- docs/mllib-collaborative-filtering.md | 8 ++- .../spark/mllib/util/modelSaveLoad.scala | 2 +- python/pyspark/mllib/recommendation.py | 20 ++++++- python/pyspark/mllib/util.py | 58 +++++++++++++++++++ 4 files changed, 82 insertions(+), 6 deletions(-) diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index 27aa4d38b7617..76140282a2dd0 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -200,10 +200,8 @@ In the following example we load rating data. Each row consists of a user, a pro We use the default ALS.train() method which assumes ratings are explicit. We evaluate the recommendation by measuring the Mean Squared Error of rating prediction. -Note that the Python API does not yet support model save/load but will in the future. 
- {% highlight python %} -from pyspark.mllib.recommendation import ALS, Rating +from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating # Load and parse the data data = sc.textFile("data/mllib/als/test.data") @@ -220,6 +218,10 @@ predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2])) ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions) MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y) / ratesAndPreds.count() print("Mean Squared Error = " + str(MSE)) + +# Save and load model +model.save(sc, "myModelPath") +sameModel = MatrixFactorizationModel.load(sc, "myModelPath") {% endhighlight %} If the rating matrix is derived from other source of information (i.e., it is inferred from other diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala index 4458340497f0b..526d055c87387 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala @@ -48,7 +48,7 @@ trait Saveable { * * @param sc Spark context used to save model data. * @param path Path specifying the directory in which to save this model. - * This directory and any intermediate directory will be created if needed. + * If the directory already exists, this method throws an exception. */ def save(sc: SparkContext, path: String): Unit diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index 0d99e6dedfad9..03d7d011474cb 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -19,7 +19,8 @@ from pyspark import SparkContext from pyspark.rdd import RDD -from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc +from pyspark.mllib.common import JavaModelWrapper, callMLlibFunc, inherit_doc +from pyspark.mllib.util import Saveable, JavaLoader __all__ = ['MatrixFactorizationModel', 'ALS', 'Rating'] @@ -39,7 +40,8 @@ def __reduce__(self): return Rating, (int(self.user), int(self.product), float(self.rating)) -class MatrixFactorizationModel(JavaModelWrapper): +@inherit_doc +class MatrixFactorizationModel(JavaModelWrapper, Saveable, JavaLoader): """A matrix factorisation model trained by regularized alternating least-squares. @@ -81,6 +83,17 @@ class MatrixFactorizationModel(JavaModelWrapper): >>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=10) >>> model.predict(2,2) 0.43... + + >>> import os, tempfile + >>> path = tempfile.mkdtemp() + >>> model.save(sc, path) + >>> sameModel = MatrixFactorizationModel.load(sc, path) + >>> sameModel.predict(2,2) + 0.43... + >>> try: + ... os.removedirs(path) + ... except: + ... pass """ def predict(self, user, product): return self._java_model.predict(int(user), int(product)) @@ -98,6 +111,9 @@ def userFeatures(self): def productFeatures(self): return self.call("getProductFeatures") + def save(self, sc, path): + self.call("save", sc._jsc.sc(), path) + class ALS(object): diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index 4ed978b45409c..17d43eadba12b 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -168,6 +168,64 @@ def loadLabeledPoints(sc, path, minPartitions=None): return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions) +class Saveable(object): + """ + Mixin for models and transformers which may be saved as files. 
+ """ + + def save(self, sc, path): + """ + Save this model to the given path. + + This saves: + * human-readable (JSON) model metadata to path/metadata/ + * Parquet formatted data to path/data/ + + The model may be loaded using py:meth:`Loader.load`. + + :param sc: Spark context used to save model data. + :param path: Path specifying the directory in which to save + this model. If the directory already exists, + this method throws an exception. + """ + raise NotImplementedError + + +class Loader(object): + """ + Mixin for classes which can load saved models from files. + """ + + @classmethod + def load(cls, sc, path): + """ + Load a model from the given path. The model should have been + saved using py:meth:`Saveable.save`. + + :param sc: Spark context used for loading model files. + :param path: Path specifying the directory to which the model + was saved. + :return: model instance + """ + raise NotImplemented + + +class JavaLoader(Loader): + """ + Mixin for classes which can load saved models using its Scala + implementation. + """ + + @classmethod + def load(cls, sc, path): + java_package = cls.__module__.replace("pyspark", "org.apache.spark") + java_class = ".".join([java_package, cls.__name__]) + java_obj = sc._jvm + for name in java_class.split("."): + java_obj = getattr(java_obj, name) + return cls(java_obj.load(sc._jsc.sc(), path)) + + def _test(): import doctest from pyspark.context import SparkContext From 3f00bb3ef1384fabf86a68180d40a1a515f6f5e3 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Sun, 1 Mar 2015 16:28:15 -0800 Subject: [PATCH 357/357] [SPARK-6083] [MLLib] [DOC] Make Python API example consistent in NaiveBayes Author: MechCoder Closes #4834 from MechCoder/spark-6083 and squashes the following commits: 1cdd7b5 [MechCoder] Add parse function 65bbbe9 [MechCoder] [SPARK-6083] Make Python API example consistent in NaiveBayes --- docs/mllib-naive-bayes.md | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md index 5224a0b49a991..55b8f2ce6c364 100644 --- a/docs/mllib-naive-bayes.md +++ b/docs/mllib-naive-bayes.md @@ -115,22 +115,28 @@ used for evaluation and prediction. Note that the Python API does not yet support model save/load but will in the future. - {% highlight python %} -from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.classification import NaiveBayes +from pyspark.mllib.linalg import Vectors +from pyspark.mllib.regression import LabeledPoint + +def parseLine(line): + parts = line.split(',') + label = float(parts[0]) + features = Vectors.dense([float(x) for x in parts[1].split(' ')]) + return LabeledPoint(label, features) + +data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine) -# an RDD of LabeledPoint -data = sc.parallelize([ - LabeledPoint(0.0, [0.0, 0.0]) - ... # more labeled points -]) +# Split data aproximately into training (60%) and test (40%) +training, test = data.randomSplit([0.6, 0.4], seed = 0) # Train a naive Bayes model. -model = NaiveBayes.train(data, 1.0) +model = NaiveBayes.train(training, 1.0) -# Make prediction. -prediction = model.predict([0.0, 0.0]) +# Make prediction and test accuracy. +predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label)) +accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() {% endhighlight %}
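For comparison with the Python snippet, a rough Scala sketch of the same parse/train/evaluate flow against the MLlib 1.x API (assumes a SparkContext `sc`, the sample data path, and an arbitrary split seed):

    import org.apache.spark.mllib.classification.NaiveBayes
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint

    // Each input line looks like "label, f1 f2 f3".
    val parsed = sc.textFile("data/mllib/sample_naive_bayes_data.txt").map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }

    // Split approximately into training (60%) and test (40%).
    val Array(training, test) = parsed.randomSplit(Array(0.6, 0.4), seed = 0L)

    val model = NaiveBayes.train(training, lambda = 1.0)

    // Fraction of held-out points predicted correctly.
    val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
    val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()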