From 3bd32f023d9bd83da7afab37fffe614064df3e6b Mon Sep 17 00:00:00 2001
From: Daniel Darabos
Date: Tue, 8 Jul 2014 10:43:46 -0700
Subject: [PATCH] [SPARK-2403] Catch all errors during serialization in DAGScheduler

https://issues.apache.org/jira/browse/SPARK-2403

Spark hangs for us whenever we forget to register a class with Kryo. This should be a simple fix for that. But let me know if you have a better suggestion.

I did not write a new test for this. It would be pretty complicated and I'm not sure it's worthwhile for such a simple change. Let me know if you disagree.

Author: Daniel Darabos

Closes #1329 from darabos/spark-2403 and squashes the following commits:

3aceaad [Daniel Darabos] Print full stack trace for miscellaneous exceptions during serialization.
52c22ba [Daniel Darabos] Only catch NonFatal exceptions.
361e962 [Daniel Darabos] Catch all errors during serialization in DAGScheduler.

(cherry picked from commit c8a2313cdf825e0191680a423d17619b5504ff89)
Signed-off-by: Aaron Davidson
---
 .../main/scala/org/apache/spark/scheduler/DAGScheduler.scala | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index d15aaa3fc11ed..a9fd7e71318cc 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -26,6 +26,7 @@ import scala.concurrent.Await
 import scala.concurrent.duration._
 import scala.language.postfixOps
 import scala.reflect.ClassTag
+import scala.util.control.NonFatal
 
 import akka.actor._
 import akka.actor.OneForOneStrategy
@@ -771,6 +772,10 @@ class DAGScheduler(
           abortStage(stage, "Task not serializable: " + e.toString)
           runningStages -= stage
           return
+        case NonFatal(e) => // Other exceptions, such as IllegalArgumentException from Kryo.
+          abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
+          runningStages -= stage
+          return
       }
 
       logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")