From 12ef6e3af0a2f8df2375e774c9bfcc313fb84d8a Mon Sep 17 00:00:00 2001
From: William Benton
Date: Tue, 25 Mar 2014 23:45:45 -0500
Subject: [PATCH] Skip proactive closure capture for runJob

There are two possible cases for runJob calls: either they are called
by RDD action methods from inside Spark or they are called from client
code. There's no need to proactively check the closure argument to
runJob for serializability or force variable capture in either case:

1. if they are called by RDD actions, their closure arguments consist
   of mapping an already-serializable closure (with an already-frozen
   environment) to each element in the RDD;

2. in both cases, the closure is about to execute and thus the benefit
   of proactively checking for serializability (or ensuring immediate
   variable capture) is nonexistent.

(Note that ensuring capture via serializability on closure arguments to
runJob also causes pyspark accumulators to fail to update.)
---
 core/src/main/scala/org/apache/spark/SparkContext.scala | 4 +++-
 core/src/main/scala/org/apache/spark/rdd/RDD.scala      | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index f3c5b420db8a2..4c5b35d4025bf 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -896,7 +896,9 @@ class SparkContext(
       require(p >= 0 && p < rdd.partitions.size, s"Invalid partition requested: $p")
     }
     val callSite = getCallSite
-    val cleanedFunc = clean(func)
+    // There's no need to check this function for serializability,
+    // since it will be run right away.
+    val cleanedFunc = clean(func, false)
     logInfo("Starting job: " + callSite)
     val start = System.nanoTime
     dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal,
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 1b43040c6d918..42a1e3faec722 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -640,14 +640,16 @@ abstract class RDD[T: ClassTag](
    * Applies a function f to all elements of this RDD.
    */
   def foreach(f: T => Unit) {
-    sc.runJob(this, (iter: Iterator[T]) => iter.foreach(f))
+    val cleanF = sc.clean(f)
+    sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
   }
 
   /**
    * Applies a function f to each partition of this RDD.
    */
   def foreachPartition(f: Iterator[T] => Unit) {
-    sc.runJob(this, (iter: Iterator[T]) => f(iter))
+    val cleanF = sc.clean(f)
+    sc.runJob(this, (iter: Iterator[T]) => cleanF(iter))
   }
 
   /**
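
A minimal usage sketch, not part of the patch: it illustrates the split the
change relies on, where the action (here foreach) cleans the user closure via
sc.clean(f), so the iterator wrapper it hands to runJob can be cleaned with
the serializability check disabled (clean(func, false)). The master string,
app name, and object name below are placeholders for a local test setup.

```scala
import org.apache.spark.SparkContext

object RunJobCleaningSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder local setup; any master/app name would do.
    val sc = new SparkContext("local[2]", "runJob-cleaning-sketch")
    val counter = sc.accumulator(0)
    val data = sc.parallelize(1 to 100)

    // foreach cleans the user closure itself (val cleanF = sc.clean(f)),
    // so the wrapper it passes to runJob already has a frozen environment.
    data.foreach(x => counter += x)

    // With this patch, runJob calls clean(func, false) on that wrapper and
    // skips the redundant proactive serializability check.
    println(counter.value)
    sc.stop()
  }
}
```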