diff --git a/client/client.go b/client/client.go index ce6c1f2afcd5..d142f13a2dc5 100644 --- a/client/client.go +++ b/client/client.go @@ -14,11 +14,11 @@ import ( "sync" "time" - "github.com/armon/go-metrics" + metrics "github.com/armon/go-metrics" consulapi "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/lib" - "github.com/hashicorp/go-hclog" - "github.com/hashicorp/go-multierror" + hclog "github.com/hashicorp/go-hclog" + multierror "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/allocrunner" "github.com/hashicorp/nomad/client/allocrunner/interfaces" @@ -1006,6 +1006,15 @@ func (c *Client) restoreState() error { // Load each alloc back for _, alloc := range allocs { + // COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported + // See hasLocalState for details. Skipping suspicious allocs + // now. If allocs should be run, they will be started when the client + // gets allocs from servers. + if !c.hasLocalState(alloc) { + c.logger.Warn("found a alloc without any local state, skipping restore", "alloc_id", alloc.ID) + continue + } + //XXX On Restore we give up on watching previous allocs because // we need the local AllocRunners initialized first. We could // add a second loop to initialize just the alloc watcher. @@ -1062,6 +1071,42 @@ func (c *Client) restoreState() error { return nil } +// hasLocalState returns true if we have any other associated state +// with alloc beyond the task itself +// +// Useful for detecting if a potentially completed alloc got resurrected +// after AR was destroyed. In such cases, re-running the alloc lead to +// unexpected reruns and may lead to process and task exhaustion on node. +// +// The heuristic used here is an alloc is suspect if we see no other information +// and no other task/status info is found. +// +// Also, an alloc without any client state will not be restored correctly; there will +// be no tasks processes to reattach to, etc. In such cases, client should +// wait until it gets allocs from server to launch them. +// +// See: +// * https://github.com/hashicorp/nomad/pull/6207 +// * https://github.com/hashicorp/nomad/issues/5984 +// +// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported +func (c *Client) hasLocalState(alloc *structs.Allocation) bool { + tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) + if tg == nil { + // corrupt alloc?! + return false + } + + for _, task := range tg.Tasks { + ls, tr, _ := c.stateDB.GetTaskRunnerState(alloc.ID, task.Name) + if ls != nil || tr != nil { + return true + } + } + + return false +} + func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) { c.invalidAllocsLock.Lock() c.invalidAllocs[alloc.ID] = struct{}{} diff --git a/client/client_test.go b/client/client_test.go index d9b2013b56f4..f1a4c00134f5 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -10,10 +10,12 @@ import ( "testing" "time" - "github.com/hashicorp/go-memdb" + memdb "github.com/hashicorp/go-memdb" + trstate "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state" "github.com/hashicorp/nomad/client/config" consulApi "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/fingerprint" + "github.com/hashicorp/nomad/client/state" "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper/pluginutils/catalog" "github.com/hashicorp/nomad/helper/testlog" @@ -27,7 +29,7 @@ import ( "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/assert" - "github.com/hashicorp/go-hclog" + hclog "github.com/hashicorp/go-hclog" cstate "github.com/hashicorp/nomad/client/state" ctestutil "github.com/hashicorp/nomad/client/testutil" "github.com/stretchr/testify/require" @@ -1644,3 +1646,44 @@ func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) { assert.EqualValues(t, n, un) } } + +// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported +func TestClient_hasLocalState(t *testing.T) { + t.Parallel() + + c, cleanup := TestClient(t, nil) + defer cleanup() + + c.stateDB = state.NewMemDB(c.logger) + + t.Run("plain alloc", func(t *testing.T) { + alloc := mock.BatchAlloc() + c.stateDB.PutAllocation(alloc) + + require.False(t, c.hasLocalState(alloc)) + }) + + t.Run("alloc with a task with local state", func(t *testing.T) { + alloc := mock.BatchAlloc() + taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name + ls := &trstate.LocalState{} + + c.stateDB.PutAllocation(alloc) + c.stateDB.PutTaskRunnerLocalState(alloc.ID, taskName, ls) + + require.True(t, c.hasLocalState(alloc)) + }) + + t.Run("alloc with a task with task state", func(t *testing.T) { + alloc := mock.BatchAlloc() + taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name + ts := &structs.TaskState{ + State: structs.TaskStateRunning, + } + + c.stateDB.PutAllocation(alloc) + c.stateDB.PutTaskState(alloc.ID, taskName, ts) + + require.True(t, c.hasLocalState(alloc)) + }) +}