[WIP][SPARK-31197][CORE] Exit the executor once all tasks and migrations are finished, built on top of spark20629 #28817
@@ -210,6 +210,10 @@ private[spark] class CoarseGrainedExecutorBackend(
    case UpdateDelegationTokens(tokenBytes) =>
      logInfo(s"Received tokens of ${tokenBytes.length} bytes")
      SparkHadoopUtil.get.addDelegationTokens(tokenBytes, env.conf)

    case DecommissionSelf =>
      logInfo("Received decommission self")
      decommissionSelf()
  }

  override def onDisconnected(remoteAddress: RpcAddress): Unit = {
@@ -258,26 +262,65 @@ private[spark] class CoarseGrainedExecutorBackend(
    System.exit(code)
  }

  private def decommissionSelf(): Boolean = {
    logInfo("Decommissioning self w/sync")
    try {
      decommissioned = true
      // Tell master we are decommissioned so it stops trying to schedule us
      if (driver.nonEmpty) {
        driver.get.askSync[Boolean](DecommissionExecutor(executorId))
  private var previousAllBlocksMigrated = false
  private def shutdownIfDone(): Unit = {
    val numRunningTasks = executor.numRunningTasks
    logInfo(s"Checking to see if we can shutdown, have ${numRunningTasks} running tasks.")
    if (executor.numRunningTasks == 0) {
      if (env.conf.get(STORAGE_DECOMMISSION_ENABLED)) {
        val allBlocksMigrated = env.blockManager.decommissionManager match {
          case Some(m) => m.allBlocksMigrated
          case None => false // We haven't started migrations yet.
        }
        if (allBlocksMigrated && previousAllBlocksMigrated) {
          logInfo("No running tasks, all blocks migrated, stopping.")
          exitExecutor(0, "Finished decommissioning", notifyDriver = true)
Interestingly, the driver does indeed respond back with a Also, as currently written, this

So it's my understanding the I think swapping out exit executor for instead telling the driver to stop the executor and avoiding the

I was talking about the case where we get shot down before we had a chance to cleanly exit on line 276. Say, for example, some timeout expires and the executor/node is brought down. In this case, the executor will eventually be marked as lost via a heartbeat/timeout. And that loss would be deemed the fault of the task, and could cause job failures. I am wondering if we can fix that scenario of an unclean exit? One workaround I suggested above was to send a message to the driver saying that the executor is going to go away soon. When that happens (in a clean or unclean way), that loss shouldn't be attributed to the task. Perhaps this unclean executor loss/timeout handling is follow-up work? We (or rather I) can create Jiras for this under the parent ticket :-).

Sure, although I think this behaviour is covered by the changes in https://github.com/apache/spark/pull/26440/files (we only increment failures if the executor's previous state was not decommissioning).

Can you please double check that? I couldn't find this behavior when scouring TaskSchedulerImpl and TaskSetManager. The only place we check for an executor being decommissioned in that PR is when scheduling tasks (in CoarseGrainedSchedulerBackend#isExecutorActive). Thanks!

Can you point to where in TaskSchedulerImpl it's going to fail the job?

In this match block, we will hit the default case, which will treat the failure as having been caused by the app and thus penalize it. This routine is called from

Thanks :) If you want to make a PR for that I'd be happy to review/merge, since I think that would not depend on any of the in-flight PRs, just the current code in master.

Absolutely! Thanks
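To make the concern concrete, here is a minimal, self-contained sketch of the pattern being discussed: a loss-reason match whose default case charges the lost executor's in-flight tasks to the app. The type and function names are invented for illustration; this is not Spark's actual ExecutorLossReason handling.

```scala
// Illustrative model only: a decommission-aware loss reason versus the default case
// that blames the app for the tasks that were running on the lost executor.
sealed trait LossReason
case object ExecutorDecommissioned extends LossReason // hypothetical decommission-aware reason
case class ExecutorLost(message: String) extends LossReason

def countTowardsTaskFailures(reason: LossReason): Boolean = reason match {
  case ExecutorDecommissioned => false // a planned shutdown should not penalize the tasks
  case _ => true                       // default: the loss is treated as the task's/app's fault
}
```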
        }
        previousAllBlocksMigrated = allBlocksMigrated
      } else {
        logError("No driver to message decommissioning.")
        logInfo("No running tasks, no block migration configured, stopping.")
        exitExecutor(0, "Finished decommissioning", notifyDriver = true)
      }
      if (executor != null) {
        executor.decommission()
    } else {
      // If there's a running task it could store blocks.
I think this logic of previousAllBlocksMigrated and allBlocksMigrated is a bit confusing. It's not clear why the previous state has to be considered. I wonder if the following code can make this "history" aspect a bit clearer:

Also, should we really be checking for numRunningTasks here? What if some race condition caused some tasks to be scheduled onto us while we were marked for decom? Finally, should there be a timeout for how much time the executor will stay alive in the decommissioned state?

If a task is scheduled before we are asked to decom. You can verify this is covered by taking the logic out and watching the tests fail :) (There's an ugly thread sleep in the tests to make this possible.) Since the block migrations are not atomic, I do think we need the 2x logic, unfortunately; think of this situation (the sketch below illustrates it):

Now that being said, that's probably a corner case, and arguably not super important since we're really only doing best effort, but I think for the overhead of one extra boolean it's worth it to cover this corner case.
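A small standalone sketch of that corner case and why a single clean sweep is not enough. This illustrates the reasoning only; the function and its input are hypothetical, not the PR's code.

```scala
// Each element of sweeps is "were any blocks still left after this migration sweep?".
// Because a running task can add a block right after a sweep reports nothing left,
// migration is only treated as finished after two consecutive clean sweeps.
def migrationLooksFinished(sweeps: Seq[Boolean]): Boolean = {
  var previousBlocksLeft = true
  var done = false
  for (blocksLeft <- sweeps if !done) {
    done = !blocksLeft && !previousBlocksLeft
    previousBlocksLeft = blocksLeft
  }
  done
}

migrationLooksFinished(Seq(true, false))        // false: only one clean sweep so far
migrationLooksFinished(Seq(true, false, false)) // true: two consecutive clean sweeps
migrationLooksFinished(Seq(false, true, false)) // false: a block showed up between sweeps
```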
      previousAllBlocksMigrated = false
    }
  }

  private def decommissionSelf(): Boolean = {
    if (!decommissioned) {
      logInfo("Decommissioning self w/sync")
Perhaps we should expand what 'w/sync' stands for in the log message?

Sure
      try {
        decommissioned = true
        // Tell master we are decommissioned so it stops trying to schedule us
        if (driver.nonEmpty) {
          driver.get.askSync[Boolean](DecommissionExecutor(executorId))
        } else {
          logError("No driver to message decommissioning.")
        }
        if (executor != null) {
          executor.decommission()
        }
        // Shutdown the executor once all tasks are gone :)
        val shutdownThread = new Thread() {
          while (true) {
            shutdownIfDone()
            Thread.sleep(1000) // 1s
          }
        }
        shutdownThread.setDaemon(true)
        shutdownThread.setName("decommission-shutdown-thread")
        shutdownThread.start()
logInfo("Done decommissioning self.") | ||||
// Return true since we are handling a signal | ||||
true | ||||
} catch { | ||||
case e: Exception => | ||||
logError(s"Error ${e} during attempt to decommission self") | ||||
false | ||||
} | ||||
logInfo("Done decommissioning self.") | ||||
// Return true since we are handling a signal | ||||
} else { | ||||
true | ||||
} catch { | ||||
case e: Exception => | ||||
logError(s"Error ${e} during attempt to decommission self") | ||||
false | ||||
} | ||||
} | ||||
} | ||||
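For reference, a minimal sketch of the same polling shutdown with the loop placed inside an overridden run(), which is the body that executes on the spawned thread. Names mirror the diff above, but this is an illustration, not the PR's code.

```scala
// Poll once a second; shutdownIfDone() exits the executor once there are no running
// tasks and (if storage decommissioning is enabled) all blocks report as migrated.
val shutdownThread = new Thread("decommission-shutdown-thread") {
  override def run(): Unit = {
    while (true) {
      shutdownIfDone()
      Thread.sleep(1000) // re-check every second
    }
  }
}
shutdownThread.setDaemon(true) // don't keep the JVM alive on normal exit paths
shutdownThread.start()
```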
@@ -229,38 +229,12 @@ private[spark] class Executor(

  private[executor] def numRunningTasks: Int = runningTasks.size()

  private def shutdownIfDone(): Unit = {
    if (numRunningTasks == 0) {
      if (conf.get(STORAGE_DECOMMISSION_ENABLED)) {
        val allBlocksMigrated = env.blockManager.decommissionManager match {
          case Some(m) => m.allBlocksMigrated
          case None => false // We haven't started migrations yet.
        }
        if (allBlocksMigrated) {
          stop()
        }
      } else {
        stop()
      }
    }
  }

  /**
   * Mark an executor for decommissioning and avoid launching new tasks.
   */
  private[spark] def decommission(): Unit = {
    logInfo("Executor asked to decommission. Starting shutdown thread.")
I think this comment looks stale. It should probably be moved to the CoarseGrainedExecutorBackend. It's also not clear to me what the

Just logging for now. The reason I propagate the message to the executor is so that if we end up in a state where the executor believes it decommissioned (say a local SIGPWR) but the driver doesn't, it could be weird, so having some logging is useful.
    decommissioned = true
    // Shutdown the executor once all tasks are gone :)
    val shutdownThread = new Thread() {
      while (true) {
        shutdownIfDone()
        Thread.sleep(1000) // 1s
      }
    }
    shutdownThread.setDaemon(true)
    shutdownThread.setName("decommission-shutdown-thread")
    shutdownThread.start()
  }

  def launchTask(context: ExecutorBackend, taskDescription: TaskDescription): Unit = {
@@ -52,6 +52,8 @@ private[spark] object CoarseGrainedClusterMessages {
  case class UpdateDelegationTokens(tokens: Array[Byte])
    extends CoarseGrainedClusterMessage

  case object DecommissionSelf extends CoarseGrainedClusterMessage // Mark as decommissioned.
IMHO, the DecommissionSelf naming is a bit ambiguous: who is "self" here, the sender or the receiver? This message is now sent from the driver to the executor. So perhaps we should just repurpose DecommissionExecutor with a check for the executorId? Not a big deal, but trying to reduce the number of message types introduced by this feature ;)

I think DecommissionSelf is pretty clearly telling the receiver to decommission itself. That being said, I'm open to renaming.

Sounds good ;-) I checked that DecommissionSelf is indeed not used anywhere else, so it should be unambiguous. Let's keep the name.
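For context, a hedged sketch of the driver-side send that would pair with the `case DecommissionSelf` handler added in CoarseGrainedExecutorBackend above. The helper and the way the executor's RpcEndpointRef is obtained are assumptions for illustration, not code from this PR.

```scala
import org.apache.spark.rpc.RpcEndpointRef
import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.DecommissionSelf

// Hypothetical helper: one-way message telling a single executor to decommission itself.
// The executor then notifies the driver via DecommissionExecutor(executorId) from
// decommissionSelf(), so no reply is expected here.
def askExecutorToDecommission(executorEndpoint: RpcEndpointRef): Unit = {
  executorEndpoint.send(DecommissionSelf)
}
```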

  // Executors to driver
  case class RegisterExecutor(
      executorId: String,
@@ -1927,6 +1927,7 @@ private[spark] class BlockManager(
   * Stop migrating shuffle blocks.
   */
  def stopOffloadingShuffleBlocks(): Unit = {
    logInfo("Stopping offloading shuffle blocks")
    migrationPeers.values.foreach(_.running = false)
  }

@@ -2050,6 +2051,7 @@ private[spark] class BlockManager(
    @volatile private var stopped = false
    // Since running tasks can add more blocks this can change.
Just to make sure I am totally understanding this: you mean the tasks that were already running when decommissioning was started at the executor? Because I think we refuse to launch new tasks once decommissioning has started, so the new blocks being written must be written by already-running tasks. Did I get this right?

Also, just to confirm I am still following along: I don't see this case handled in the existing BlockManagerSuite. I believe we are not testing writing new blocks while the decom/offload is in progress.

It is covered; you can verify this by disabling this logic and seeing the test fail (albeit you'll have to run the test a few times because it becomes a race condition). Look at the "migrateDuring" flag for details.
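A rough sketch of the scenario being discussed (new blocks written while decommissioning/offloading is already in progress). This is illustrative, not the suite's actual test; the config key is the string form I'd expect for the STORAGE_DECOMMISSION_ENABLED constant referenced in the diff and may differ in this branch.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

// Tasks sleep before producing their data, so if decommissioning is triggered while the
// job is running, cached blocks appear *after* the first migration sweep may have run.
val conf = new SparkConf()
  .setMaster("local-cluster[2, 1, 1024]")
  .setAppName("decommission-race-sketch")
  .set("spark.storage.decommission.enabled", "true") // assumed key for STORAGE_DECOMMISSION_ENABLED

val sc = new SparkContext(conf)
val rdd = sc.parallelize(1 to 100, 2)
  .map { x => Thread.sleep(5000); x } // keep tasks running while decommissioning starts
  .persist(StorageLevel.MEMORY_ONLY)

rdd.count() // blocks get written mid-migration, which is what the 2x check guards against
```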
    @volatile var allBlocksMigrated = false
    var previousBlocksLeft = true
    private val blockMigrationThread = new Thread {
      val sleepInterval = conf.get(
        config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL)

@@ -2065,17 +2067,20 @@ private[spark] class BlockManager(
        var blocksLeft = false
        // If enabled we migrate shuffle blocks first as they are more expensive.
        if (conf.get(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED)) {
          logDebug(s"Attempting to replicate all shuffle blocks")
          logDebug("Attempting to replicate all shuffle blocks")
          blocksLeft = blocksLeft || offloadShuffleBlocks()
          logInfo(s"Done starting workers to migrate shuffle blocks")
          logInfo("Done starting workers to migrate shuffle blocks")
        }
        if (conf.get(config.STORAGE_RDD_DECOMMISSION_ENABLED)) {
          logDebug(s"Attempting to replicate all cached RDD blocks")
          logDebug("Attempting to replicate all cached RDD blocks")
          blocksLeft = blocksLeft || decommissionRddCacheBlocks()
          logInfo(s"Attempt to replicate all cached blocks done")
          logInfo("Attempt to replicate all cached blocks done")
          blocksLeft
        }
        allBlocksMigrated = ! blocksLeft
        logInfo(s"We have blocksLeft: ${blocksLeft}")
        // Avoid the situation where block was added during the loop
        allBlocksMigrated = (! blocksLeft) && (! previousBlocksLeft)
        previousBlocksLeft = blocksLeft
        if (!conf.get(config.STORAGE_RDD_DECOMMISSION_ENABLED) &&
            !conf.get(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED)) {
          logWarning("Decommissioning, but no task configured set one or both:\n" +
@@ -2117,6 +2122,7 @@ private[spark] class BlockManager(
  }

  def stop(): Unit = {
    logInfo("Stopping decommission manager")
    decommissionManager.foreach(_.stop())
    blockTransferService.close()
    if (blockStoreClient ne blockTransferService) {
Should this variable be marked volatile?

I don't think so, this will only be accessed in one thread.
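A minimal illustration of the visibility reasoning in this exchange (assumed field names, not the BlockManager code): a flag written by the migration thread but polled from another thread needs @volatile, while a variable that only the migration thread ever reads and writes does not.

```scala
class MigrationState {
  // Written by the block-migration thread, read by the executor's shutdown poller,
  // so it must be @volatile for the update to be visible across threads.
  @volatile var allBlocksMigrated: Boolean = false

  // Only ever read and written inside the migration thread's own loop,
  // so plain field access is enough.
  private var previousBlocksLeft: Boolean = true
}
```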