Skip to content

Commit

Permalink
Merge pull request #6216 from hashicorp/b-recognize-pending-allocs
Browse files Browse the repository at this point in the history
alloc_runner: wait when starting suspicious allocs
  • Loading branch information
Mahmood Ali committed Aug 28, 2019
2 parents ddf2f6b + 8b05f87 commit b2ef75e
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 5 deletions.
51 changes: 48 additions & 3 deletions client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ import (
"sync"
"time"

"github.com/armon/go-metrics"
metrics "github.com/armon/go-metrics"
consulapi "github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-multierror"
hclog "github.com/hashicorp/go-hclog"
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/allocrunner"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
Expand Down Expand Up @@ -1006,6 +1006,15 @@ func (c *Client) restoreState() error {
// Load each alloc back
for _, alloc := range allocs {

// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
// See hasLocalState for details. Skipping suspicious allocs
// now. If allocs should be run, they will be started when the client
// gets allocs from servers.
if !c.hasLocalState(alloc) {
c.logger.Warn("found a alloc without any local state, skipping restore", "alloc_id", alloc.ID)
continue
}

//XXX On Restore we give up on watching previous allocs because
// we need the local AllocRunners initialized first. We could
// add a second loop to initialize just the alloc watcher.
Expand Down Expand Up @@ -1062,6 +1071,42 @@ func (c *Client) restoreState() error {
return nil
}

// hasLocalState returns true if we have any other associated state
// with alloc beyond the task itself
//
// Useful for detecting if a potentially completed alloc got resurrected
// after AR was destroyed. In such cases, re-running the alloc lead to
// unexpected reruns and may lead to process and task exhaustion on node.
//
// The heuristic used here is an alloc is suspect if we see no other information
// and no other task/status info is found.
//
// Also, an alloc without any client state will not be restored correctly; there will
// be no tasks processes to reattach to, etc. In such cases, client should
// wait until it gets allocs from server to launch them.
//
// See:
// * https://github.com/hashicorp/nomad/pull/6207
// * https://github.com/hashicorp/nomad/issues/5984
//
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
// corrupt alloc?!
return false
}

for _, task := range tg.Tasks {
ls, tr, _ := c.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
if ls != nil || tr != nil {
return true
}
}

return false
}

func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) {
c.invalidAllocsLock.Lock()
c.invalidAllocs[alloc.ID] = struct{}{}
Expand Down
47 changes: 45 additions & 2 deletions client/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@ import (
"testing"
"time"

"github.com/hashicorp/go-memdb"
memdb "github.com/hashicorp/go-memdb"
trstate "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
"github.com/hashicorp/nomad/client/config"
consulApi "github.com/hashicorp/nomad/client/consul"
"github.com/hashicorp/nomad/client/fingerprint"
"github.com/hashicorp/nomad/client/state"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/helper/pluginutils/catalog"
"github.com/hashicorp/nomad/helper/testlog"
Expand All @@ -27,7 +29,7 @@ import (
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/assert"

"github.com/hashicorp/go-hclog"
hclog "github.com/hashicorp/go-hclog"
cstate "github.com/hashicorp/nomad/client/state"
ctestutil "github.com/hashicorp/nomad/client/testutil"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -1644,3 +1646,44 @@ func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) {
assert.EqualValues(t, n, un)
}
}

// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func TestClient_hasLocalState(t *testing.T) {
t.Parallel()

c, cleanup := TestClient(t, nil)
defer cleanup()

c.stateDB = state.NewMemDB(c.logger)

t.Run("plain alloc", func(t *testing.T) {
alloc := mock.BatchAlloc()
c.stateDB.PutAllocation(alloc)

require.False(t, c.hasLocalState(alloc))
})

t.Run("alloc with a task with local state", func(t *testing.T) {
alloc := mock.BatchAlloc()
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
ls := &trstate.LocalState{}

c.stateDB.PutAllocation(alloc)
c.stateDB.PutTaskRunnerLocalState(alloc.ID, taskName, ls)

require.True(t, c.hasLocalState(alloc))
})

t.Run("alloc with a task with task state", func(t *testing.T) {
alloc := mock.BatchAlloc()
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
ts := &structs.TaskState{
State: structs.TaskStateRunning,
}

c.stateDB.PutAllocation(alloc)
c.stateDB.PutTaskState(alloc.ID, taskName, ts)

require.True(t, c.hasLocalState(alloc))
})
}

0 comments on commit b2ef75e

Please sign in to comment.