diff --git a/client/task_runner.go b/client/task_runner.go
index 383106a0252e..a33df2c39a46 100644
--- a/client/task_runner.go
+++ b/client/task_runner.go
@@ -1703,13 +1703,6 @@ func (r *TaskRunner) Restart(source, reason string, failure bool) {
 	}
 }
 
-// RestartDelay returns the *max* value of the delay for this task's restart
-// policy for use by the healtcheck watcher.
-func (r *TaskRunner) RestartDelay() time.Duration {
-	delay := r.alloc.Job.LookupTaskGroup(r.alloc.TaskGroup).RestartPolicy.Delay
-	return delay + time.Duration(float64(delay)*jitter)
-}
-
 // Signal will send a signal to the task
 func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {
 
diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go
index 9ae94f282344..05b636a4bc5b 100644
--- a/command/agent/consul/check_watcher.go
+++ b/command/agent/consul/check_watcher.go
@@ -21,10 +21,8 @@ type ChecksAPI interface {
 	Checks() (map[string]*api.AgentCheck, error)
 }
 
-// TaskRestarter allows the checkWatcher to restart tasks and how long the
-// grace period should be afterward.
+// TaskRestarter allows the checkWatcher to restart tasks.
 type TaskRestarter interface {
-	RestartDelay() time.Duration
 	Restart(source, reason string, failure bool)
 }
 
@@ -36,7 +34,6 @@ type checkRestart struct {
 	checkName string
 	task      TaskRestarter
 
-	restartDelay   time.Duration
 	grace          time.Duration
 	interval       time.Duration
 	timeLimit      time.Duration
@@ -55,17 +52,19 @@ type checkRestart struct {
 	logger *log.Logger
 }
 
-// update restart state for check and restart task if necessary. Currrent
+// apply restart state for check and restart task if necessary. Current
 // timestamp is passed in so all check updates have the same view of time (and
 // to ease testing).
-func (c *checkRestart) update(now time.Time, status string) {
+//
+// Returns true if a restart was triggered in which case this check should be
+// removed (checks are added on task startup).
+func (c *checkRestart) apply(now time.Time, status string) bool {
 	healthy := func() {
 		if !c.unhealthyStart.IsZero() {
 			c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became healthy; canceling restart",
 				c.allocID, c.taskName, c.checkName)
 			c.unhealthyStart = time.Time{}
 		}
-		return
 	}
 	switch status {
 	case api.HealthCritical:
@@ -73,17 +72,17 @@ func (c *checkRestart) update(now time.Time, status string) {
 		if c.ignoreWarnings {
 			// Warnings are ignored, reset state and exit
 			healthy()
-			return
+			return false
 		}
 	default:
 		// All other statuses are ok, reset state and exit
 		healthy()
-		return
+		return false
 	}
 
 	if now.Before(c.graceUntil) {
-		// In grace period exit
-		return
+		// In grace period, exit
+		return false
 	}
 
 	if c.unhealthyStart.IsZero() {
@@ -106,11 +105,10 @@ func (c *checkRestart) update(now time.Time, status string) {
 		// Tell TaskRunner to restart due to failure
 		const failure = true
 		c.task.Restart("healthcheck", fmt.Sprintf("check %q unhealthy", c.checkName), failure)
-
-		// Reset grace time to grace + restart.delay
-		c.graceUntil = now.Add(c.grace + c.restartDelay)
-		c.unhealthyStart = time.Time{}
+		return true
 	}
+
+	return false
 }
 
 // checkWatchUpdates add or remove checks from the watcher
@@ -179,17 +177,16 @@ func (w *checkWatcher) Run(ctx context.Context) {
 
 	// Main watch loop
 	for {
+		// disable polling if there are no checks
+		if len(checks) == 0 {
+			stopTimer()
+		}
+
 		select {
 		case update := <-w.checkUpdateCh:
			if update.remove {
				// Remove a check
				delete(checks, update.checkID)
-
-				// disable polling if last check was removed
-				if len(checks) == 0 {
-					stopTimer()
-				}
-
				continue
			}
 
@@ -235,7 +232,13 @@ func (w *checkWatcher) Run(ctx context.Context) {
 					continue
 				}
 
-				check.update(now, result.Status)
+				restarted := check.apply(now, result.Status)
+				if restarted {
+					// Checks are registered+watched on
+					// startup, so it's safe to remove them
+					// whenever they're restarted
+					delete(checks, cid)
+				}
 			}
 		}
 	}
@@ -254,7 +257,6 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S
 		checkID:      checkID,
 		checkName:    check.Name,
 		task:         restarter,
-		restartDelay: restarter.RestartDelay(),
 		interval:     check.Interval,
 		grace:        check.CheckRestart.Grace,
 		graceUntil:   time.Now().Add(check.CheckRestart.Grace),
diff --git a/command/agent/consul/check_watcher_test.go b/command/agent/consul/check_watcher_test.go
index c8544bdc76fb..3098182eb1c1 100644
--- a/command/agent/consul/check_watcher_test.go
+++ b/command/agent/consul/check_watcher_test.go
@@ -85,7 +85,7 @@ func TestCheckWatcher_Skip(t *testing.T) {
 	check.CheckRestart = nil
 
 	cw := newCheckWatcher(testLogger(), newFakeChecksAPI())
-	restarter1 := newFakeCheckRestarter()
+	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check)
 	cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1)
 
 	// Check should have been dropped as it's not watched
@@ -101,13 +101,13 @@ func TestCheckWatcher_Healthy(t *testing.T) {
 	fakeAPI, cw := testWatcherSetup()
 
 	check1 := testCheck()
-	restarter1 := newFakeCheckRestarter()
+	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
 	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
 
 	check2 := testCheck()
 	check2.CheckRestart.Limit = 1
 	check2.CheckRestart.Grace = 0
-	restarter2 := newFakeCheckRestarter()
+	restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2)
 	cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2)
 
 	// Make both checks healthy from the beginning
@@ -135,14 +135,13 @@ func TestCheckWatcher_Unhealthy(t *testing.T) {
 	fakeAPI, cw := testWatcherSetup()
 
 	check1 := testCheck()
-	restarter1 := newFakeCheckRestarter()
+	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
 	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
 
 	check2 := testCheck()
 	check2.CheckRestart.Limit = 1
-	check2.CheckRestart.Grace = 0
-	restarter2 := newFakeCheckRestarter()
-	restarter2.restartDelay = 600 * time.Millisecond
+	check2.CheckRestart.Grace = 200 * time.Millisecond
+	restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2)
 	cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2)
 
 	// Check 1 always passes, check 2 always fails
@@ -150,7 +149,7 @@ func TestCheckWatcher_Unhealthy(t *testing.T) {
 	fakeAPI.add("testcheck2", "critical", time.Time{})
 
 	// Run for 1 second
-	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
 	defer cancel()
 	cw.Run(ctx)
 
@@ -176,15 +175,14 @@ func TestCheckWatcher_HealthyWarning(t *testing.T) {
 	check1.CheckRestart.Limit = 1
 	check1.CheckRestart.Grace = 0
 	check1.CheckRestart.IgnoreWarnings = true
-	restarter1 := newFakeCheckRestarter()
-	restarter1.restartDelay = 1100 * time.Millisecond
+	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
 	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
 
 	// Check is always in warning but that's ok
 	fakeAPI.add("testcheck1", "warning", time.Time{})
 
 	// Run for 1 second
-	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
 	defer cancel()
 	cw.Run(ctx)
 
@@ -203,7 +201,7 @@ func TestCheckWatcher_Flapping(t *testing.T) {
 
 	check1 := testCheck()
 	check1.CheckRestart.Grace = 0
-	restarter1 := newFakeCheckRestarter()
+	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
 	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
 
 	// Check flaps and is never failing for the full 200ms needed to restart
@@ -214,7 +212,7 @@ func TestCheckWatcher_Flapping(t *testing.T) {
 	fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond))
 	fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond))
 
-	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond)
 	defer cancel()
 	cw.Run(ctx)
 
@@ -234,14 +232,14 @@ func TestCheckWatcher_Unwatch(t *testing.T) {
 	check1 := testCheck()
 	check1.CheckRestart.Limit = 1
 	check1.CheckRestart.Grace = 100 * time.Millisecond
-	restarter1 := newFakeCheckRestarter()
+	restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1)
 	cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1)
 	cw.Unwatch("testcheck1")
 
 	// Always failing
 	fakeAPI.add("testcheck1", "critical", time.Time{})
 
-	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
+	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
 	defer cancel()
 	cw.Run(ctx)
 
diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go
index 011fa0065afe..ddb7e2dfc486 100644
--- a/command/agent/consul/unit_test.go
+++ b/command/agent/consul/unit_test.go
@@ -64,37 +64,43 @@ type checkRestartRecord struct {
 
 // fakeCheckRestarter is a test implementation of TaskRestarter.
 type fakeCheckRestarter struct {
-	// restartDelay is returned by RestartDelay to control the behavior of
-	// the checkWatcher
-	restartDelay time.Duration
-
 	// restarts is a slice of all of the restarts triggered by the checkWatcher
 	restarts []checkRestartRecord
-}
 
-func newFakeCheckRestarter() *fakeCheckRestarter {
-	return &fakeCheckRestarter{}
+	// need the checkWatcher to re-Watch restarted tasks like TaskRunner
+	watcher *checkWatcher
+
+	// check to re-Watch on restarts
+	check     *structs.ServiceCheck
+	allocID   string
+	taskName  string
+	checkName string
 }
 
-// RestartDelay implements part of the TaskRestarter interface needed for check
-// watching and is normally fulfilled by a task runner.
-//
-// The return value is determined by the restartDelay field.
-func (c *fakeCheckRestarter) RestartDelay() time.Duration {
-	return c.restartDelay
+func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter {
+	return &fakeCheckRestarter{
+		watcher:   w,
+		check:     c,
+		allocID:   allocID,
+		taskName:  taskName,
+		checkName: checkName,
+	}
 }
 
 // Restart implements part of the TaskRestarter interface needed for check
 // watching and is normally fulfilled by a TaskRunner.
 //
-// Restarts are recorded in the []restarts field.
+// Restarts are recorded in the []restarts field and re-Watch the check.
 func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) {
 	c.restarts = append(c.restarts, checkRestartRecord{time.Now(), source, reason, failure})
+
+	// Re-Watch the check just like TaskRunner
+	c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c)
 }
 
 // String for debugging
 func (c *fakeCheckRestarter) String() string {
-	s := ""
+	s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName)
 	for _, r := range c.restarts {
 		s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure)
 	}
@@ -152,7 +158,6 @@ func setupFake() *testFakeCtx {
 		ServiceClient: NewServiceClient(fc, true, testLogger()),
 		FakeConsul:    fc,
 		Task:          testTask(),
-		Restarter:     newFakeCheckRestarter(),
 		execs:         make(chan int, 100),
 	}
 }
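
Reviewer note: the contract this diff moves to is that a restart consumes the watch. apply() returns true, Run() deletes the check from its map, and whoever implements TaskRestarter is expected to Watch() its checks again when the task comes back up (TaskRunner does this on task startup; the test fakeCheckRestarter above now does it inside Restart). The sketch below is only an illustration of that contract under those assumptions, not Nomad code; the Restarter, watcher, task, and fail names are invented for the example.

package main

import (
	"fmt"
	"sync"
)

// Restarter mirrors the slimmed-down TaskRestarter in this diff: the watcher
// only needs Restart; RestartDelay is gone.
type Restarter interface {
	Restart(source, reason string, failure bool)
}

// watcher is a toy stand-in for checkWatcher: it tracks watched check IDs and
// forgets a check as soon as that check triggers a restart.
type watcher struct {
	mu     sync.Mutex
	checks map[string]Restarter
}

func (w *watcher) Watch(checkID string, r Restarter) {
	w.mu.Lock()
	defer w.mu.Unlock()
	w.checks[checkID] = r
}

// fail simulates a check exceeding its unhealthy limit: the task is told to
// restart and the check is dropped, because the restarted task is expected to
// Watch its checks again when it comes back up.
func (w *watcher) fail(checkID string) {
	w.mu.Lock()
	r, ok := w.checks[checkID]
	delete(w.checks, checkID)
	w.mu.Unlock()
	if ok {
		r.Restart("healthcheck", fmt.Sprintf("check %q unhealthy", checkID), true)
	}
}

// task re-registers its check on restart, like TaskRunner and the test
// fakeCheckRestarter in this diff do.
type task struct {
	w       *watcher
	checkID string
}

func (t *task) Restart(source, reason string, failure bool) {
	fmt.Printf("restarting: source=%s reason=%s failure=%t\n", source, reason, failure)
	t.w.Watch(t.checkID, t) // re-Watch, as happens on task startup
}

func main() {
	w := &watcher{checks: map[string]Restarter{}}
	t := &task{w: w, checkID: "web-alive"}
	w.Watch(t.checkID, t)
	w.fail(t.checkID) // triggers a restart; the task re-Watches itself
	fmt.Println("checks still watched:", len(w.checks))
}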