Don't persist allocs of destroyed alloc runners
This fixes a bug where allocs that have been GCed get re-run after the client
is restarted.  A heavily used client may launch thousands of such allocs on
startup and get killed.

The bug is that an alloc runner that gets destroyed due to GC remains in the
client's alloc runner set.  It keeps being persisted periodically until the
alloc is GCed by the server.  During that time, the client DB contains the
alloc but neither its individual task states nor its completed state.  On
client restart, the client assumes the alloc is still pending and re-runs it.
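To make the failure mode concrete, here is a minimal, hypothetical sketch of
the kind of restore decision described above; the types and the `shouldRerun`
helper are illustrative only and are not Nomad's actual client code:

```go
package main

import "fmt"

// persistedAlloc models only what the client DB ends up holding for a GCed
// alloc in this bug: the alloc itself, but no task states.
type persistedAlloc struct {
	ID         string
	TaskStates map[string]string // task name -> state; empty if never persisted
}

// shouldRerun mimics a restore path that can only consult the client DB:
// with no task states and no recorded completion, the alloc looks pending.
func shouldRerun(a persistedAlloc) bool {
	if len(a.TaskStates) == 0 {
		return true // no evidence the alloc ever ran, so treat it as pending
	}
	for _, s := range a.TaskStates {
		if s != "dead" {
			return true
		}
	}
	return false
}

func main() {
	// A GCed alloc that kept being persisted without its task states:
	ghost := persistedAlloc{ID: "c0ffee-alloc", TaskStates: nil}
	fmt.Println(shouldRerun(ghost)) // true: the alloc is re-run after restart
}
```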

Here, we fix it by ensuring that destroyed alloc runners don't persist any alloc
to the state DB.

This is a short-term fix; we should consider revamping client state
management.  Storing alloc and task information non-transactionally and
non-atomically, concurrently while the alloc runner is running and potentially
changing state, is a recipe for bugs.
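As a rough illustration of that direction (not part of this change), a revamp
could commit an alloc and its task states in a single transaction so a reader
never observes a half-written alloc.  The sketch below uses bbolt directly,
with made-up bucket names and payload layout:

```go
// Package statesketch is an illustrative sketch, not Nomad code: it shows an
// alloc and its task states being written atomically in one bbolt transaction.
package statesketch

import (
	"encoding/json"

	bolt "go.etcd.io/bbolt"
)

// PersistAllocAtomically writes the alloc and all of its task states in a
// single transaction; either everything becomes visible or nothing does.
// Bucket and key names here are made up for the example.
func PersistAllocAtomically(db *bolt.DB, allocID string, alloc interface{}, taskStates map[string]interface{}) error {
	return db.Update(func(tx *bolt.Tx) error {
		bkt, err := tx.CreateBucketIfNotExists([]byte("alloc-" + allocID))
		if err != nil {
			return err
		}

		raw, err := json.Marshal(alloc)
		if err != nil {
			return err
		}
		if err := bkt.Put([]byte("alloc"), raw); err != nil {
			return err
		}

		for name, ts := range taskStates {
			raw, err := json.Marshal(ts)
			if err != nil {
				return err
			}
			if err := bkt.Put([]byte("task-"+name), raw); err != nil {
				return err
			}
		}
		return nil
	})
}
```

A single Update call like this would remove the window in which the alloc row
exists without any task state.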

Fixes #5984
Related to #5890
Mahmood Ali committed Aug 25, 2019
1 parent b4a80a7 commit a80643e
Showing 3 changed files with 78 additions and 1 deletion.
18 changes: 18 additions & 0 deletions client/allocrunner/alloc_runner.go
@@ -784,6 +784,24 @@ func (ar *allocRunner) destroyImpl() {
ar.destroyedLock.Unlock()
}

func (ar *allocRunner) PersistState() error {
// Note that a race exists where one goroutine attempts to persist state
// while another kicks off the destruction process. Here, we reconcile by
// always deleting the alloc bucket after the alloc is destroyed.
if ar.IsDestroyed() {
err := ar.stateDB.DeleteAllocationBucket(ar.id)
if err != nil {
ar.logger.Warn("failed to delete allocation bucket", "error", err)
}
return nil
}

// TODO: consider persisting deployment state along with task status.
// Until we understand why only the alloc is persisted, we keep the current
// behavior and avoid risking yet more unnecessary IO calls.
return ar.stateDB.PutAllocation(ar.Alloc())
}

// Destroy the alloc runner by stopping it if it is still running and cleaning
// up all of its resources.
//
58 changes: 58 additions & 0 deletions client/allocrunner/alloc_runner_test.go
@@ -1001,3 +1001,61 @@ func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
require.Fail(t, "err: %v", err)
})
}

// TestAllocRunner_PersistState_Destroyed asserts that destroyed alloc runners no longer persist their alloc to the state DB
func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
t.Parallel()

alloc := mock.BatchAlloc()
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name

conf, cleanup := testAllocRunnerConfig(t, alloc)
conf.StateDB = state.NewMemDB(conf.Logger)

defer cleanup()
ar, err := NewAllocRunner(conf)
require.NoError(t, err)
defer destroy(ar)

go ar.Run()

select {
case <-ar.WaitCh():
case <-time.After(10 * time.Second):
require.Fail(t, "timed out waiting for alloc to complete")
}

// test final persisted state upon completion
require.NoError(t, ar.PersistState())
allocs, _, err := conf.StateDB.GetAllAllocations()
require.NoError(t, err)
require.Len(t, allocs, 1)
require.Equal(t, alloc.ID, allocs[0].ID)
_, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
require.NoError(t, err)
require.Equal(t, structs.TaskStateDead, ts.State)

// check that DB alloc is empty after destroying AR
ar.Destroy()
select {
case <-ar.DestroyCh():
case <-time.After(10 * time.Second):
require.Fail(t, "timedout waiting for destruction")
}

allocs, _, err = conf.StateDB.GetAllAllocations()
require.NoError(t, err)
require.Empty(t, allocs)
_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
require.NoError(t, err)
require.Nil(t, ts)

// check that DB alloc is empty after persisting state of destroyed AR
require.NoError(t, ar.PersistState())
allocs, _, err = conf.StateDB.GetAllAllocations()
require.NoError(t, err)
require.Empty(t, allocs)
_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
require.NoError(t, err)
require.Nil(t, ts)
}
3 changes: 2 additions & 1 deletion client/client.go
@@ -139,6 +139,7 @@ type AllocRunner interface {
ShutdownCh() <-chan struct{}
Signal(taskName, signal string) error
GetTaskEventHandler(taskName string) drivermanager.EventHandler
PersistState() error

RestartTask(taskName string, taskEvent *structs.TaskEvent) error
RestartAll(taskEvent *structs.TaskEvent) error
@@ -1084,7 +1085,7 @@ func (c *Client) saveState() error {

for id, ar := range runners {
go func(id string, ar AllocRunner) {
-err := c.stateDB.PutAllocation(ar.Alloc())
+err := ar.PersistState()
if err != nil {
c.logger.Error("error saving alloc state", "error", err, "alloc_id", id)
l.Lock()
