Skip to content

Commit

Permalink
consul: fix deadlock in check-based restarts
Browse files Browse the repository at this point in the history
Fixes #5395
Alternative to #5957

Make task restarting asynchronous when handling check-based restarts.
This matches the pre-0.9 behavior where TaskRunner.Restart was an
asynchronous signal. The check-based restarting code was not designed
to handle blocking in TaskRunner.Restart. 0.9 made it reentrant and
could easily overwhelm the buffered update chan and deadlock.

Many thanks to @byronwolfman for his excellent debugging, PR, and
reproducer!

I created this alternative as changing the functionality of
TaskRunner.Restart has a much larger impact. This approach reverts to
old known-good behavior and minimizes the number of places changes are
made.
  • Loading branch information
schmichael committed Jul 17, 2019
1 parent 596b5aa commit 9c418c2
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 6 deletions.
26 changes: 20 additions & 6 deletions command/agent/consul/check_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,20 +103,34 @@ func (c *checkRestart) apply(ctx context.Context, now time.Time, status string)
c.logger.Debug("restarting due to unhealthy check")

// Tell TaskRunner to restart due to failure
const failure = true
reason := fmt.Sprintf("healthcheck: check %q unhealthy", c.checkName)
event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
err := c.task.Restart(ctx, event, failure)
if err != nil {
// Error restarting
return false
}
go asyncRestart(ctx, c.logger, c.task, event)
return true
}

return false
}

// asyncRestart restarts the task without blocking the caller, mirroring the
// asynchronous pre-0.9 TaskRunner.Restart semantics. It is intended to run in
// its own goroutine.
func asyncRestart(ctx context.Context, logger log.Logger, task TaskRestarter, event *structs.TaskEvent) {
	// Restarts triggered by the check watcher always count as failures.
	const failure = true

	// Bound how long this goroutine may block: the restart is fire-and-forget
	// from the caller's perspective, so waiting indefinitely buys nothing.
	restartCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	err := task.Restart(restartCtx, event, failure)
	if err == nil {
		return
	}

	// Restart errors are not actionable here; they only matter when
	// debugging allocation lifecycle management.
	logger.Debug("error restarting task", "error", err,
		"event_time", event.Time, "event_type", event.Type)
}

// checkWatchUpdate adds or removes a check from the watcher
type checkWatchUpdate struct {
checkID string
Expand Down
23 changes: 23 additions & 0 deletions command/agent/consul/check_watcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/require"
)

// checkRestartRecord is used by a testFakeCtx to record when restarts occur
Expand Down Expand Up @@ -194,6 +195,28 @@ func TestCheckWatcher_Healthy(t *testing.T) {
}
}

// TestCheckWatcher_Unhealthy asserts that a task whose check is persistently
// failing gets restarted exactly one time.
func TestCheckWatcher_Unhealthy(t *testing.T) {
	t.Parallel()

	fakeAPI, cw := testWatcherSetup(t)

	// Register a single watched check.
	check := testCheck()
	restarter := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check)
	cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter)

	// A critical status since the zero time means the check has never passed.
	fakeAPI.add("testcheck1", "critical", time.Time{})

	// Drive the watcher for a bounded period, then stop it.
	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer cancel()
	cw.Run(ctx)

	// The unhealthy task must have been restarted exactly once.
	require.Len(t, restarter.restarts, 1)
}

// TestCheckWatcher_HealthyWarning asserts checks in warning with
// ignore_warnings=true do not restart tasks.
func TestCheckWatcher_HealthyWarning(t *testing.T) {
Expand Down

0 comments on commit 9c418c2

Please sign in to comment.