From 12ef6e3af0a2f8df2375e774c9bfcc313fb84d8a Mon Sep 17 00:00:00 2001
From: William Benton
Date: Tue, 25 Mar 2014 23:45:45 -0500
Subject: [PATCH] Skip proactive closure capture for runJob

There are two possible cases for runJob calls: either they are called
by RDD action methods from inside Spark or they are called from client
code. There's no need to proactively check the closure argument to
runJob for serializability or force variable capture in either case:

1. if they are called by RDD actions, their closure arguments consist
   of mapping an already-serializable closure (with an already-frozen
   environment) to each element in the RDD;

2. in both cases, the closure is about to execute and thus the benefit
   of proactively checking for serializability (or ensuring immediate
   variable capture) is nonexistent.

(Note that ensuring capture via serializability on closure arguments to
runJob also causes pyspark accumulators to fail to update.)
---
 core/src/main/scala/org/apache/spark/SparkContext.scala | 4 +++-
 core/src/main/scala/org/apache/spark/rdd/RDD.scala      | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index f3c5b420db8a2..4c5b35d4025bf 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -896,7 +896,9 @@ class SparkContext(
       require(p >= 0 && p < rdd.partitions.size, s"Invalid partition requested: $p")
     }
     val callSite = getCallSite
-    val cleanedFunc = clean(func)
+    // There's no need to check this function for serializability,
+    // since it will be run right away.
+    val cleanedFunc = clean(func, false)
     logInfo("Starting job: " + callSite)
     val start = System.nanoTime
     dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal,
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 1b43040c6d918..42a1e3faec722 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -640,14 +640,16 @@ abstract class RDD[T: ClassTag](
    * Applies a function f to all elements of this RDD.
    */
   def foreach(f: T => Unit) {
-    sc.runJob(this, (iter: Iterator[T]) => iter.foreach(f))
+    val cleanF = sc.clean(f)
+    sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
   }
 
   /**
    * Applies a function f to each partition of this RDD.
    */
   def foreachPartition(f: Iterator[T] => Unit) {
-    sc.runJob(this, (iter: Iterator[T]) => f(iter))
+    val cleanF = sc.clean(f)
+    sc.runJob(this, (iter: Iterator[T]) => cleanF(iter))
   }
 
   /**
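
A minimal usage sketch, not part of the patch: it illustrates the split the
change relies on, where the action (here foreach) cleans the user closure via
sc.clean(f), so the iterator wrapper it hands to runJob can be cleaned with
the serializability check disabled (clean(func, false)). The master string,
app name, and object name below are placeholders for a local test setup.

```scala
import org.apache.spark.SparkContext

object RunJobCleaningSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder local setup; any master/app name would do.
    val sc = new SparkContext("local[2]", "runJob-cleaning-sketch")
    val counter = sc.accumulator(0)
    val data = sc.parallelize(1 to 100)

    // foreach cleans the user closure itself (val cleanF = sc.clean(f)),
    // so the wrapper it passes to runJob already has a frozen environment.
    data.foreach(x => counter += x)

    // With this patch, runJob calls clean(func, false) on that wrapper and
    // skips the redundant proactive serializability check.
    println(counter.value)
    sc.stop()
  }
}
```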