Restart unhealthy tasks #3105

Merged (33 commits, Sep 18, 2017)
Changes from 7 commits

Commits (33):
a720bb5  Add restart fields (schmichael, Aug 24, 2017)
bd1a342  Nest restart fields in CheckRestart (schmichael, Aug 25, 2017)
1608e59  Add check watcher for restarting unhealthy tasks (schmichael, Aug 26, 2017)
ebbf87f  Use existing restart policy infrastructure (schmichael, Sep 10, 2017)
555d1e2  on_warning=false -> ignore_warnings=false (schmichael, Sep 11, 2017)
c2d895d  Add comments and move delay calc to TaskRunner (schmichael, Sep 11, 2017)
78c72f8  Default grace period to 1s (schmichael, Sep 11, 2017)
7e103f6  Document new check_restart stanza (schmichael, Sep 11, 2017)
850d991  Add changelog entry for #3105 (schmichael, Sep 11, 2017)
3db835c  Improve check watcher logging and add tests (schmichael, Sep 13, 2017)
526528c  Removed partially implemented allocLock (schmichael, Sep 13, 2017)
568b963  Remove unused lastStart field (schmichael, Sep 13, 2017)
9fb2865  Fix whitespace (schmichael, Sep 13, 2017)
092057a  Canonicalize and Merge CheckRestart in api (schmichael, Sep 14, 2017)
8b8c164  Wrap check watch updates in a struct (schmichael, Sep 14, 2017)
237c096  Simplify from 2 select loops to one (schmichael, Sep 14, 2017)
f8e872c  RestartDelay isn't needed as checks are re-added on restarts (schmichael, Sep 14, 2017)
40ed262  Handle multiple failing checks on a single task (schmichael, Sep 14, 2017)
5cd1d57  Watched -> TriggersRestart (schmichael, Sep 14, 2017)
10dc1c7  DRY up restart handling a bit. (schmichael, Sep 14, 2017)
3c0a42b  Rename unhealthy var and fix test indeterminism (schmichael, Sep 14, 2017)
6f72270  Test check watch updates (schmichael, Sep 14, 2017)
a508bb9  Fold SetFailure into SetRestartTriggered (schmichael, Sep 14, 2017)
5141c95  Add check_restart to jobspec tests (schmichael, Sep 14, 2017)
1564e1c  Move check_restart to its own section. (schmichael, Sep 14, 2017)
8014762  Add comments (schmichael, Sep 15, 2017)
cde908e  Cleanup and test restart failure code (schmichael, Sep 15, 2017)
fa836d8  Name const after what it represents (schmichael, Sep 15, 2017)
924813d  Test converting CheckRestart from api->structs (schmichael, Sep 15, 2017)
6bcf019  Test CheckRestart.Validate (schmichael, Sep 15, 2017)
10ae18c  Minor corrections to check_restart docs (schmichael, Sep 15, 2017)
967825d  Fix comments: task -> check (schmichael, Sep 15, 2017)
3d7446d  @dadgar is better at words than me (schmichael, Sep 15, 2017)

api/tasks.go (2 changes: 2 additions & 0 deletions)

@@ -187,6 +187,8 @@ func (s *Service) Canonicalize(t *Task, tg *TaskGroup, job *Job) {

s.CheckRestart.Canonicalize()

// Canonicalize CheckRestart on Checks and merge Service.CheckRestart
// into each check.
for _, c := range s.Checks {

c.CheckRestart.Canonicalize()
c.CheckRestart = c.CheckRestart.Merge(s.CheckRestart)
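
The `c.CheckRestart = c.CheckRestart.Merge(s.CheckRestart)` call above is where service-level `check_restart` settings flow into each check. `Merge` itself is not shown in this 7-commit view, so here is a minimal standalone sketch of the likely precedence rule (check-level values win, service-level values only fill in fields the check left unset), using the `Limit`/`Grace`/`IgnoreWarnings` fields that appear in the test diffs further down; the real `api.CheckRestart` may differ in detail.

```go
package main

import (
	"fmt"
	"time"
)

// CheckRestart is an illustrative stand-in for api.CheckRestart with the
// fields exercised by the job_endpoint tests below.
type CheckRestart struct {
	Limit          int
	Grace          *time.Duration
	IgnoreWarnings bool
}

// Merge sketches the documented precedence: the check-level (receiver) values
// take priority and service-level settings only fill in unset fields.
func (c *CheckRestart) Merge(o *CheckRestart) *CheckRestart {
	if c == nil && o == nil {
		return nil
	}
	if c == nil {
		nc := *o // no check-level settings: inherit the service-level ones
		return &nc
	}
	nc := *c // copy so neither input is mutated
	if o == nil {
		return &nc
	}
	if nc.Limit == 0 {
		nc.Limit = o.Limit
	}
	if nc.Grace == nil {
		nc.Grace = o.Grace
	}
	if !nc.IgnoreWarnings {
		nc.IgnoreWarnings = o.IgnoreWarnings
	}
	return &nc
}

func main() {
	grace := 10 * time.Second
	service := &CheckRestart{Limit: 3, Grace: &grace, IgnoreWarnings: true}
	check := &CheckRestart{Limit: 1} // the check only overrides the limit
	merged := check.Merge(service)
	fmt.Println(merged.Limit, *merged.Grace, merged.IgnoreWarnings) // 1 10s true
}
```

One caveat of this sketch: an intentional zero `Limit` on the check cannot be distinguished from "unset", which is presumably why `Grace` is a pointer in the api package.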

client/restarts.go (63 changes: 34 additions & 29 deletions)

@@ -74,7 +74,9 @@ func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker
}

// SetRestartTriggered is used to mark that the task has been signalled to be
// restarted
// restarted. Setting the failure to true restarts according to the restart
// policy. When failure is false the task is restarted without considering the
// restart policy.
func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker {

Review comment (Contributor): Comment on the param

r.lock.Lock()
defer r.lock.Unlock()
@@ -143,39 +145,42 @@ func (r *RestartTracker) GetState() (string, time.Duration) {
}

// Handle restarts due to failures
if r.failure {
if r.startErr != nil {
// If the error is not recoverable, do not restart.
if !structs.IsRecoverable(r.startErr) {
r.reason = ReasonUnrecoverableErrror
return structs.TaskNotRestarting, 0
}
} else if r.waitRes != nil {
// If the task started successfully and restart on success isn't specified,
// don't restart but don't mark as failed.
if r.waitRes.Successful() && !r.onSuccess {
r.reason = "Restart unnecessary as task terminated successfully"
return structs.TaskTerminated, 0
}
}
if !r.failure {
return "", 0
}

if r.count > r.policy.Attempts {
if r.policy.Mode == structs.RestartPolicyModeFail {
r.reason = fmt.Sprintf(
`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
r.policy.Attempts, r.policy.Interval)
return structs.TaskNotRestarting, 0
} else {
r.reason = ReasonDelay
return structs.TaskRestarting, r.getDelay()
}
if r.startErr != nil {
// If the error is not recoverable, do not restart.
if !structs.IsRecoverable(r.startErr) {
r.reason = ReasonUnrecoverableErrror
return structs.TaskNotRestarting, 0
}
} else if r.waitRes != nil {
// If the task started successfully and restart on success isn't specified,
// don't restart but don't mark as failed.
if r.waitRes.Successful() && !r.onSuccess {
r.reason = "Restart unnecessary as task terminated successfully"
return structs.TaskTerminated, 0
}
}

r.reason = ReasonWithinPolicy
return structs.TaskRestarting, r.jitter()
// If this task has been restarted due to failures more times
// than the restart policy allows within an interval fail
// according to the restart policy's mode.
if r.count > r.policy.Attempts {
if r.policy.Mode == structs.RestartPolicyModeFail {
r.reason = fmt.Sprintf(
`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
r.policy.Attempts, r.policy.Interval)
return structs.TaskNotRestarting, 0
} else {
r.reason = ReasonDelay
return structs.TaskRestarting, r.getDelay()
}
}

return "", 0
r.reason = ReasonWithinPolicy
return structs.TaskRestarting, r.jitter()
}

// getDelay returns the delay time to enter the next interval.

client/restarts_test.go (13 changes: 13 additions & 0 deletions)

@@ -104,6 +104,19 @@ func TestClient_RestartTracker_RestartTriggered(t *testing.T) {
}
}

func TestClient_RestartTracker_RestartTriggered_Failure(t *testing.T) {
t.Parallel()
p := testPolicy(true, structs.RestartPolicyModeFail)
p.Attempts = 1
rt := newRestartTracker(p, structs.JobTypeService)
if state, when := rt.SetRestartTriggered(true).GetState(); state != structs.TaskRestarting || when == 0 {
t.Fatalf("expect restart got %v %v", state, when)
}
if state, when := rt.SetRestartTriggered(true).GetState(); state != structs.TaskNotRestarting || when != 0 {
t.Fatalf("expect failed got %v %v", state, when)
}
}

func TestClient_RestartTracker_StartError_Recoverable_Fail(t *testing.T) {
t.Parallel()
p := testPolicy(true, structs.RestartPolicyModeFail)

client/task_runner.go (4 changes: 2 additions & 2 deletions)

@@ -789,8 +789,8 @@ OUTER:
return
}
case structs.VaultChangeModeRestart:
const failure = false
r.Restart("vault", "new Vault token acquired", failure)
const noFailure = false
r.Restart("vault", "new Vault token acquired", noFailure)
case structs.VaultChangeModeNoop:
fallthrough
default:

command/agent/consul/check_watcher.go (4 changes: 2 additions & 2 deletions)

@@ -267,7 +267,7 @@ func (w *checkWatcher) Run(ctx context.Context) {
}
}

// Watch a task and restart it if unhealthy.
// Watch a check and restart its task if unhealthy.
func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, restarter TaskRestarter) {
if !check.TriggersRestarts() {
// Not watched, noop
@@ -302,7 +302,7 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S
}
}

// Unwatch a task.
// Unwatch a check.
func (w *checkWatcher) Unwatch(cid string) {
c := checkWatchUpdate{
checkID: cid,
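
The `Watch`/`Unwatch` pair above is the whole surface the task runner uses: register a check when its `check_restart` settings trigger restarts, and drop it when the task stops. The real watcher in `check_watcher.go` polls Consul; as a rough illustration of the per-check accounting it performs (grace period after task start, consecutive-failure limit, optional warning tolerance, as described in the docs diff below), here is a standalone sketch with made-up names, not the PR's implementation.

```go
package main

import (
	"fmt"
	"time"
)

// checkRestartState is an illustrative stand-in for the per-check bookkeeping
// a watcher needs; it is not the PR's checkWatcher implementation.
type checkRestartState struct {
	limit          int           // consecutive failures before restarting
	grace          time.Duration // ignore results this long after task (re)start
	ignoreWarnings bool          // treat "warning" results as healthy

	taskStart time.Time
	failures  int
}

// apply records one Consul check result and reports whether the watcher
// should ask the task runner for a restart.
func (c *checkRestartState) apply(now time.Time, status string) bool {
	healthy := status == "passing" || (c.ignoreWarnings && status == "warning")
	if healthy {
		c.failures = 0 // a single passing check resets the count
		return false
	}
	if now.Before(c.taskStart.Add(c.grace)) {
		return false // still inside the grace period after a (re)start
	}
	c.failures++
	return c.failures >= c.limit
}

func main() {
	w := &checkRestartState{
		limit:     3,
		grace:     90 * time.Second,
		taskStart: time.Now().Add(-2 * time.Minute), // started long enough ago
	}
	for i, status := range []string{"critical", "critical", "critical"} {
		if w.apply(time.Now(), status) {
			fmt.Printf("restart triggered after %d consecutive failures\n", i+1)
		}
	}
}
```

In the real code the restart itself goes through `RestartTracker.SetRestartTriggered(failure=true)` shown earlier, so repeated health-based restarts remain bounded by the group's `restart` policy.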

command/agent/job_endpoint_test.go (10 changes: 10 additions & 0 deletions)

@@ -1216,6 +1216,11 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) {
Interval: 4 * time.Second,
Timeout: 2 * time.Second,
InitialStatus: "ok",
CheckRestart: &api.CheckRestart{
Limit: 3,
Grace: helper.TimeToPtr(10 * time.Second),
IgnoreWarnings: true,
},
},
},
},
@@ -1406,6 +1411,11 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) {
Interval: 4 * time.Second,
Timeout: 2 * time.Second,
InitialStatus: "ok",
CheckRestart: &structs.CheckRestart{
Limit: 3,
Grace: 10 * time.Second,
IgnoreWarnings: true,
},
},
},
},

nomad/structs/structs.go (7 changes: 4 additions & 3 deletions)

@@ -2780,15 +2780,16 @@ func (c *CheckRestart) Validate() error {
return nil
}

var mErr multierror.Error
if c.Limit < 0 {
return fmt.Errorf("limit must be greater than or equal to 0 but found %d", c.Limit)
mErr.Errors = append(mErr.Errors, fmt.Errorf("limit must be greater than or equal to 0 but found %d", c.Limit))
}

if c.Grace < 0 {
return fmt.Errorf("grace period must be greater than or equal to 0 but found %d", c.Grace)
mErr.Errors = append(mErr.Errors, fmt.Errorf("grace period must be greater than or equal to 0 but found %d", c.Grace))
}

return nil
return mErr.ErrorOrNil()
}

const (

nomad/structs/structs_test.go (18 changes: 18 additions & 0 deletions)

@@ -1154,6 +1154,24 @@ func TestTask_Validate_Service_Check(t *testing.T) {
}
}

func TestTask_Validate_Service_Check_CheckRestart(t *testing.T) {
invalidCheckRestart := &CheckRestart{
Limit: -1,
Grace: -1,
}

err := invalidCheckRestart.Validate()
assert.NotNil(t, err, "invalidCheckRestart.Validate()")
assert.Len(t, err.(*multierror.Error).Errors, 2)

validCheckRestart := &CheckRestart{}
assert.Nil(t, validCheckRestart.Validate())

validCheckRestart.Limit = 1
validCheckRestart.Grace = 1
assert.Nil(t, validCheckRestart.Validate())
}

func TestTask_Validate_LogConfig(t *testing.T) {
task := &Task{
LogConfig: DefaultLogConfig(),

website/source/docs/job-specification/check_restart.html.md (9 changes: 5 additions & 4 deletions)

@@ -30,6 +30,8 @@ unhealthy for the `limit` specified in a `check_restart` stanza, it is
restarted according to the task group's [`restart` policy][restart_stanza]. The
`check_restart` settings apply to [`check`s][check_stanza], but may also be
placed on [`service`s][service_stanza] to apply to all checks on a service.
`check_restart` settings on `service` will only overwrite unset `check_restart`
settings on `check`s.

Review comment (Contributor): If check_restart is set on both the check and service, the stanzas are merged with the check values taking precedence.

```hcl
job "mysql" {
Expand Down Expand Up @@ -66,7 +68,6 @@ job "mysql" {
check_restart {
limit = 3
grace = "90s"

ignore_warnings = false
}
}
@@ -78,7 +79,7 @@ job "mysql" {

- `limit` `(int: 0)` - Restart task when a health check has failed `limit`
times. For example 1 causes a restart on the first failure. The default,
`0`, disables healtcheck based restarts. Failures must be consecutive. A
`0`, disables health check based restarts. Failures must be consecutive. A
single passing check will reset the count, so flapping services may not be
restarted.

@@ -124,8 +125,8 @@ restart {
```

The [`restart` stanza][restart_stanza] controls the restart behavior of the
task. In this case it will wait 10 seconds before restarting. Note that even if
the check passes in this time the restart will still occur.
task. In this case it will stop the task and then wait 10 seconds before
starting it again.

Once the task restarts Nomad waits the `grace` period again before starting to
check the task's health.
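
Putting `grace`, the check `interval`, and `limit` together gives a rough feel for how long an unhealthy task runs before the restart cycle completes. A back-of-the-envelope sketch with hypothetical numbers (real timing also depends on when Consul reports results and on the `restart` stanza's own delay and jitter):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical values: grace and limit from check_restart, interval from
	// the check itself, delay from the restart stanza.
	grace := 90 * time.Second        // failures ignored this long after task start
	interval := 10 * time.Second     // how often Consul runs the health check
	limit := 3                       // consecutive failures needed to trigger a restart
	restartDelay := 10 * time.Second // wait before the stopped task is started again

	// Roughly: wait out the grace period, accumulate `limit` consecutive
	// failing checks (one per interval), then wait the restart delay.
	cycle := grace + time.Duration(limit)*interval + restartDelay
	fmt.Printf("roughly %s from an unhealthy start until the task is running again\n", cycle)
}
```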