From c480574b3ac7ba83679ea1fba6739c1825093c1f Mon Sep 17 00:00:00 2001
From: Shuai Lin
Date: Wed, 25 Jan 2017 19:07:11 +0000
Subject: [PATCH] Support setting the driver pod launching timeout. (#36)

* Support setting the driver pod launching timeout.

And increase the default value from 30s to 60s. The current value of 30s
can be too short, since it must cover pulling the image from a public
docker registry plus the container/JVM start time.

* Use a better name for the default timeout.
---
 .../org/apache/spark/deploy/kubernetes/Client.scala | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/Client.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/Client.scala
index fe3256b9e12be..93471a97e9ccd 100644
--- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/Client.scala
+++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/Client.scala
@@ -60,6 +60,8 @@ private[spark] class Client(
   private val driverDockerImage = sparkConf.get(
     "spark.kubernetes.driver.docker.image", s"spark-driver:$SPARK_VERSION")
   private val uploadedJars = sparkConf.getOption("spark.kubernetes.driver.uploads.jars")
+  private val driverLaunchTimeoutSecs = sparkConf.getTimeAsSeconds(
+    "spark.kubernetes.driverLaunchTimeout", s"${DEFAULT_LAUNCH_TIMEOUT_SECONDS}s")
 
   private val secretBase64String = {
     val secretBytes = new Array[Byte](128)
@@ -218,7 +220,7 @@ private[spark] class Client(
         .done()
       var submitSucceeded = false
       try {
-        submitCompletedFuture.get(LAUNCH_TIMEOUT_SECONDS, TimeUnit.SECONDS)
+        submitCompletedFuture.get(driverLaunchTimeoutSecs, TimeUnit.SECONDS)
         submitSucceeded = true
       } catch {
         case e: TimeoutException =>
@@ -226,17 +228,17 @@ private[spark] class Client(
             kubernetesClient.pods().withName(kubernetesAppId).get()
           } catch {
             case throwable: Throwable =>
-              logError(s"Timed out while waiting $LAUNCH_TIMEOUT_SECONDS seconds for the" +
+              logError(s"Timed out while waiting $driverLaunchTimeoutSecs seconds for the" +
                 " driver pod to start, but an error occurred while fetching the driver" +
                 " pod's details.", throwable)
-              throw new SparkException(s"Timed out while waiting $LAUNCH_TIMEOUT_SECONDS" +
+              throw new SparkException(s"Timed out while waiting $driverLaunchTimeoutSecs" +
                 " seconds for the driver pod to start. Unfortunately, in attempting to fetch" +
                 " the latest state of the pod, another error was thrown. Check the logs for" +
                 " the error that was thrown in looking up the driver pod.", e)
           }
           val topLevelMessage = s"The driver pod with name ${driverPod.getMetadata.getName}" +
             s" in namespace ${driverPod.getMetadata.getNamespace} was not ready in" +
-            s" $LAUNCH_TIMEOUT_SECONDS seconds."
+            s" $driverLaunchTimeoutSecs seconds."
           val podStatusPhase = if (driverPod.getStatus.getPhase != null) {
             s"Latest phase from the pod is: ${driverPod.getStatus.getPhase}"
           } else {
@@ -424,7 +426,7 @@ private[spark] object Client extends Logging {
   private val DRIVER_LAUNCHER_CONTAINER_NAME = "spark-kubernetes-driver-launcher"
   private val SECURE_RANDOM = new SecureRandom()
   private val SPARK_SUBMISSION_SECRET_BASE_DIR = "/var/run/secrets/spark-submission"
-  private val LAUNCH_TIMEOUT_SECONDS = 30
+  private val DEFAULT_LAUNCH_TIMEOUT_SECONDS = 60
   private val SPARK_APP_NAME_LABEL = "spark-app-name"
 
   def main(args: Array[String]): Unit = {
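
The following standalone Scala sketch (not part of the patch) illustrates how the new
spark.kubernetes.driverLaunchTimeout setting is resolved through SparkConf.getTimeAsSeconds.
The object name, the 120s value, and the use of loadDefaults = false are illustrative
assumptions for this example only.

import org.apache.spark.SparkConf

// Illustrative sketch only: the object name and the 120s value are made up.
// SparkConf.getTimeAsSeconds accepts suffixed time strings such as "60s" or "2min"
// and falls back to the supplied default when the key is not set.
object DriverLaunchTimeoutExample {
  def main(args: Array[String]): Unit = {
    // Mirrors the DEFAULT_LAUNCH_TIMEOUT_SECONDS value introduced by this patch.
    val defaultLaunchTimeoutSeconds = 60

    // A user would typically pass the setting on the command line, e.g.
    //   spark-submit --conf spark.kubernetes.driverLaunchTimeout=120s ...
    val sparkConf = new SparkConf(loadDefaults = false)
      .set("spark.kubernetes.driverLaunchTimeout", "120s")

    // Same call the Client uses: parse the configured value, or fall back to the default.
    val driverLaunchTimeoutSecs = sparkConf.getTimeAsSeconds(
      "spark.kubernetes.driverLaunchTimeout", s"${defaultLaunchTimeoutSeconds}s")

    println(s"Driver launch timeout: $driverLaunchTimeoutSecs seconds") // prints 120
  }
}

If the key were left unset, the same call would return 60, the new default, rather than
the previous hard-coded 30 seconds.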