client: do not restart restored tasks until server is contacted #5669

Merged: 6 commits, May 14, 2019
CHANGELOG.md: 4 additions & 0 deletions

@@ -7,6 +7,10 @@ __BACKWARDS INCOMPATIBILITIES:__
to update your code. [[GH-5536](https://github.com/hashicorp/nomad/pull/5536)]
* client: The format of check IDs in Consul has changed. If you rely upon
Nomad's check IDs you will need to update your code. [[GH-5536](https://github.com/hashicorp/nomad/pull/5536)]
* client: On startup a client will reattach to running tasks as before but
will not restart exited tasks. Exited tasks will be restarted only after the
client has reestablished communication with servers. System jobs will always
be restarted. [[GH-5669](https://github.com/hashicorp/nomad/pull/5669)]

FEATURES:

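The behavior described in the changelog entry above boils down to one decision per restored task. A minimal sketch of that decision, assuming the exported types in nomad/structs that the diff below also uses; the helper name is illustrative and not part of this PR:

```go
package restoresketch

import "github.com/hashicorp/nomad/nomad/structs"

// shouldWaitForServers is a hypothetical helper illustrating the changelog
// entry above: a task whose handle could not be restored waits for server
// contact unless its alloc is terminal or it belongs to a system job.
func shouldWaitForServers(restored bool, alloc *structs.Allocation) bool {
	if restored {
		// Reattached to a still-running task; nothing to wait for.
		return false
	}
	if alloc.TerminalStatus() || alloc.Job.Type == structs.JobTypeSystem {
		// Terminal allocs are never restarted; system jobs restart immediately.
		return false
	}
	// An exited task on a non-terminal, non-system alloc waits for the first
	// successful round of server contact before restarting.
	return true
}
```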
client/allocrunner/alloc_runner.go: 7 additions & 0 deletions

@@ -136,6 +136,11 @@ type allocRunner struct {
// driverManager is responsible for dispensing driver plugins and registering
// event handlers
driverManager drivermanager.Manager

// serversContactedCh is passed to TaskRunners so they can detect when
// servers have been contacted for the first time in case of a failed
// restore.
serversContactedCh chan struct{}
}

// NewAllocRunner returns a new allocation runner.
@@ -167,6 +172,7 @@ func NewAllocRunner(config *Config) (*allocRunner, error) {
prevAllocMigrator: config.PrevAllocMigrator,
devicemanager: config.DeviceManager,
driverManager: config.DriverManager,
serversContactedCh: config.ServersContactedCh,
}

// Create the logger based on the allocation ID
@@ -205,6 +211,7 @@ func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
DeviceStatsReporter: ar.deviceStatsReporter,
DeviceManager: ar.devicemanager,
DriverManager: ar.driverManager,
ServersContactedCh: ar.serversContactedCh,
}

// Create, but do not Run, the task runner
client/allocrunner/config.go: 4 additions & 0 deletions

@@ -51,4 +51,8 @@ type Config struct {

// DriverManager handles dispensing of driver plugins
DriverManager drivermanager.Manager

// ServersContactedCh is closed when the first GetClientAllocs call to
// servers succeeds and allocs are synced.
ServersContactedCh chan struct{}
}
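The comment above pins down the contract on ServersContactedCh: it is closed exactly once, after the first successful pull of client allocations. The code that performs the close lives on the client side and is not shown in this diff; a hedged sketch of what that side of the contract might look like, with hypothetical type and method names:

```go
package clientsketch

import "sync"

// contactTracker sketches the producer side of the ServersContactedCh
// contract described above; type and method names are hypothetical, not
// Nomad's actual client API.
type contactTracker struct {
	serversContactedCh   chan struct{}
	serversContactedOnce sync.Once
}

// markServersContacted is called after the first successful GetClientAllocs
// round. Closing the channel broadcasts to every waiting TaskRunner at once,
// and sync.Once guards against a panic from a double close.
func (c *contactTracker) markServersContacted() {
	c.serversContactedOnce.Do(func() {
		close(c.serversContactedCh)
	})
}
```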
client/allocrunner/taskrunner/task_runner.go: 48 additions & 1 deletion

@@ -193,6 +193,15 @@ type TaskRunner struct {
// maxEvents is the capacity of the TaskEvents on the TaskState.
// Defaults to defaultMaxEvents but overrideable for testing.
maxEvents int

// serversContactedCh is passed to TaskRunners so they can detect when
// GetClientAllocs has been called in case of a failed restore.
serversContactedCh <-chan struct{}
Contributor:

This is more precisely: fetched expected running allocations from server - not heartbeat/registration/etc. Not sure if it's worth making the distinction or where to add it.

Member Author:

Yeah, we've never had good nomenclature for this. The code base often talks about "syncing" allocs, but since client->server and server->client updates are completely independent code paths it's always ambiguous to say "syncing".

  • syncedFromServersCh
  • allocsPulledCh
  • allocsRetrievedCh
  • unblockedRestoredCh - we could go with a name related to its use instead of the event/state it represents, but that doesn't seem as good.

Any thoughts/preferences? allocsPulledCh might be my preference as we use the term pulled in related code/logging.

Contributor:

I'd be ok with clarifying in a comment while keeping the field name the same.


// waitOnServers defaults to false but will be set true if a restore
// fails and the Run method should wait until serversContactedCh is
// closed.
waitOnServers bool
}

type Config struct {
@@ -222,6 +231,10 @@ type Config struct {
// DriverManager is used to dispense driver plugins and register event
// handlers
DriverManager drivermanager.Manager

// ServersContactedCh is closed when the first GetClientAllocs call to
// servers succeeds and allocs are synced.
ServersContactedCh chan struct{}
}

func NewTaskRunner(config *Config) (*TaskRunner, error) {
@@ -270,6 +283,7 @@ func NewTaskRunner(config *Config) (*TaskRunner, error) {
devicemanager: config.DeviceManager,
driverManager: config.DriverManager,
maxEvents: defaultMaxEvents,
serversContactedCh: config.ServersContactedCh,
}

// Create the logger based on the allocation ID
@@ -381,6 +395,19 @@ func (tr *TaskRunner) Run() {
// - should be handled serially.
go tr.handleUpdates()

// If restore failed wait until servers are contacted before running.
// #1795
if tr.waitOnServers {
tr.logger.Info("task failed to restore; waiting to contact server before restarting")
select {
case <-tr.killCtx.Done():
case <-tr.shutdownCtx.Done():
return
case <-tr.serversContactedCh:
tr.logger.Trace("server contacted; unblocking waiting task")
}
Member Author:

Should there be a timeout after which we allow the task to restart? Some people may be relying on the fact that disconnected Nomad nodes will continue to run their tasks.

Alternatively we could have a client config and/or jobspec parameter for controlling this behavior.

Contributor:
I'm not sure - but I'd probably avoid adding more knobs to maintain until we hear clear demand.

To me the problem is that some tasks are not restarted, leaving a partially running alloc while the Nomad agent isn't running, potentially for hours. I find it easier to reason about recovery if a client that starts without being able to connect to a server behaves the same as a client that never starts. Adding a timeout seems to complicate the story for such a narrow case, without really addressing the bigger problem of having a partially failing/running alloc while the client is gone.

Member Author:
My one big concern is that when this is released it will break the ability for disconnected nodes to restart.

Right now completely disconnected nodes can fully reboot and will restart anything they were running before regardless of whether or not contact is ever made with servers.

It's difficult to know if anyone is relying on this behavior because it is much more of an emergent property of the client's code than an intentional and documented design choice.

Member Author:

Discussed offline and decided to block indefinitely. If a client can't contact servers in a timely fashion (seconds) it's likely to have its non-system allocs marked as lost and rescheduled elsewhere anyway so there's even less reason to restart at that point.
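For context, the timeout variant floated and rejected in this thread would have amounted to one extra case in the select shown above. A self-contained sketch under that assumption, with hypothetical names and an arbitrary deadline; not code from this PR:

```go
package waitsketch

import (
	"context"
	"time"
)

// waitForServers illustrates the timeout variant that was considered and
// rejected above: block until the servers-contacted channel is closed, the
// context ends, or an arbitrary deadline passes.
func waitForServers(ctx context.Context, serversContactedCh <-chan struct{}, timeout time.Duration) bool {
	select {
	case <-ctx.Done():
		return false
	case <-serversContactedCh:
		return true
	case <-time.After(timeout):
		// Give up waiting and allow the task to restart anyway.
		return false
	}
}
```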

}

MAIN:
for !tr.Alloc().TerminalStatus() {
select {
@@ -858,8 +885,28 @@ func (tr *TaskRunner) Restore() error {
if taskHandle := tr.localState.TaskHandle; taskHandle != nil {
//TODO if RecoverTask returned the DriverNetwork we wouldn't
// have to persist it at all!
tr.restoreHandle(taskHandle, tr.localState.DriverNetwork)
restored := tr.restoreHandle(taskHandle, tr.localState.DriverNetwork)

// If the handle could not be restored, the alloc is
// non-terminal, and the task isn't a system job: wait until
// servers have been contacted before running. #1795
if restored {
return nil
}

alloc := tr.Alloc()
if alloc.TerminalStatus() || alloc.Job.Type == structs.JobTypeSystem {
return nil
}

tr.logger.Trace("failed to reattach to task; will not run until server is contacted")
tr.waitOnServers = true

ev := structs.NewTaskEvent(structs.TaskRestoreFailed).
SetDisplayMessage("failed to restore task; will not run until server is contacted")
tr.UpdateState(structs.TaskStatePending, ev)
}

return nil
}
