Skip to content

Commit

Permalink
Merge pull request #6207 from hashicorp/b-gc-destroyed-allocs-rerun
Browse files Browse the repository at this point in the history
Don't persist allocs of destroyed alloc runners
  • Loading branch information
Mahmood Ali committed Aug 26, 2019
2 parents e2efeb4 + ff3dedd commit f616370
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 5 deletions.
27 changes: 23 additions & 4 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -765,14 +765,15 @@ func (ar *allocRunner) destroyImpl() {
// state if Run() ran at all.
<-ar.taskStateUpdateHandlerCh

// Cleanup state db
// Mark alloc as destroyed
ar.destroyedLock.Lock()

// Cleanup state db; while holding the lock to avoid
// a race periodic PersistState that may resurrect the alloc
if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
ar.logger.Warn("failed to delete allocation state", "error", err)
}

// Mark alloc as destroyed
ar.destroyedLock.Lock()

if !ar.shutdown {
ar.shutdown = true
close(ar.shutdownCh)
Expand All @@ -784,6 +785,24 @@ func (ar *allocRunner) destroyImpl() {
ar.destroyedLock.Unlock()
}

func (ar *allocRunner) PersistState() error {
ar.destroyedLock.Lock()
defer ar.destroyedLock.Unlock()

if ar.destroyed {
err := ar.stateDB.DeleteAllocationBucket(ar.id)
if err != nil {
ar.logger.Warn("failed to delete allocation bucket", "error", err)
}
return nil
}

// TODO: consider persisting deployment state along with task status.
// While we study why only the alloc is persisted, I opted to maintain current
// behavior and not risk adding yet more IO calls unnecessarily.
return ar.stateDB.PutAllocation(ar.Alloc())
}

// Destroy the alloc runner by stopping it if it is still running and cleaning
// up all of its resources.
//
Expand Down
58 changes: 58 additions & 0 deletions client/allocrunner/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1001,3 +1001,61 @@ func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
require.Fail(t, "err: %v", err)
})
}

// TestAllocRunner_PersistState_Destroyed asserts that destroyed allocs don't persist anymore
func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
t.Parallel()

alloc := mock.BatchAlloc()
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name

conf, cleanup := testAllocRunnerConfig(t, alloc)
conf.StateDB = state.NewMemDB(conf.Logger)

defer cleanup()
ar, err := NewAllocRunner(conf)
require.NoError(t, err)
defer destroy(ar)

go ar.Run()

select {
case <-ar.WaitCh():
case <-time.After(10 * time.Second):
require.Fail(t, "timed out waiting for alloc to complete")
}

// test final persisted state upon completion
require.NoError(t, ar.PersistState())
allocs, _, err := conf.StateDB.GetAllAllocations()
require.NoError(t, err)
require.Len(t, allocs, 1)
require.Equal(t, alloc.ID, allocs[0].ID)
_, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
require.NoError(t, err)
require.Equal(t, structs.TaskStateDead, ts.State)

// check that DB alloc is empty after destroying AR
ar.Destroy()
select {
case <-ar.DestroyCh():
case <-time.After(10 * time.Second):
require.Fail(t, "timedout waiting for destruction")
}

allocs, _, err = conf.StateDB.GetAllAllocations()
require.NoError(t, err)
require.Empty(t, allocs)
_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
require.NoError(t, err)
require.Nil(t, ts)

// check that DB alloc is empty after persisting state of destroyed AR
ar.PersistState()
allocs, _, err = conf.StateDB.GetAllAllocations()
require.NoError(t, err)
require.Empty(t, allocs)
_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
require.NoError(t, err)
require.Nil(t, ts)
}
3 changes: 2 additions & 1 deletion client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ type AllocRunner interface {
ShutdownCh() <-chan struct{}
Signal(taskName, signal string) error
GetTaskEventHandler(taskName string) drivermanager.EventHandler
PersistState() error

RestartTask(taskName string, taskEvent *structs.TaskEvent) error
RestartAll(taskEvent *structs.TaskEvent) error
Expand Down Expand Up @@ -1084,7 +1085,7 @@ func (c *Client) saveState() error {

for id, ar := range runners {
go func(id string, ar AllocRunner) {
err := c.stateDB.PutAllocation(ar.Alloc())
err := ar.PersistState()
if err != nil {
c.logger.Error("error saving alloc state", "error", err, "alloc_id", id)
l.Lock()
Expand Down

0 comments on commit f616370

Please sign in to comment.