Skip to content

Commit

Permalink
Merge pull request #5905 from hashicorp/b-ar-failed-prestart
Browse files Browse the repository at this point in the history
Fail alloc if alloc runner prestart hooks fail
  • Loading branch information
Mahmood Ali committed Jul 2, 2019
2 parents 8fc7284 + 9980239 commit 72f67c5
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 0 deletions.
8 changes: 8 additions & 0 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -248,10 +248,18 @@ func (ar *allocRunner) Run() {
default:
}

// When handling (potentially restored) terminal alloc, ensure tasks and post-run hooks are run
// to perform any cleanup that's necessary, potentially not done prior to earlier termination

// Run the prestart hooks if non-terminal
if ar.shouldRun() {
if err := ar.prerun(); err != nil {
ar.logger.Error("prerun failed", "error", err)

for _, tr := range ar.tasks {
tr.MarkFailedDead(fmt.Sprintf("failed to setup runner: %v", err))
}

goto POST
}
}
Expand Down
76 changes: 76 additions & 0 deletions client/allocrunner/alloc_runner_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,3 +215,79 @@ func TestAllocRunner_Restore_CompletedBatch(t *testing.T) {
events := ar2.AllocState().TaskStates[task.Name].Events
require.Equal(t, initialRunEvents, events)
}

// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc
// prestart hooks failed, then the alloc and subsequent tasks transition
// to failed state
func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
t.Parallel()

alloc := mock.Alloc()
alloc.Job.Type = structs.JobTypeBatch
task := alloc.Job.TaskGroups[0].Tasks[0]
task.Driver = "mock_driver"
task.Config = map[string]interface{}{
"run_for": "2ms",
}
alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{
Attempts: 0,
}

conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
defer cleanup()

// Maintain state for subsequent run
conf.StateDB = state.NewMemDB(conf.Logger)

// Start and wait for task to be running
ar, err := NewAllocRunner(conf)
require.NoError(t, err)

ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{})

go ar.Run()
defer destroy(ar)

select {
case <-ar.WaitCh():
case <-time.After(10 * time.Second):
require.Fail(t, "alloc.waitCh wasn't closed")
}

testutil.WaitForResult(func() (bool, error) {
s := ar.AllocState()
if s.ClientStatus != structs.AllocClientStatusFailed {
return false, fmt.Errorf("expected complete, got %s", s.ClientStatus)
}
return true, nil
}, func(err error) {
require.NoError(t, err)
})

// once job finishes, it shouldn't run again
require.False(t, ar.shouldRun())
initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
require.Len(t, initialRunEvents, 2)

ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
require.NoError(t, err)
require.NotNil(t, ls)
require.NotNil(t, ts)
require.Equal(t, structs.TaskStateDead, ts.State)
require.True(t, ts.Failed)

// TR waitCh must be closed too!
select {
case <-ar.tasks[task.Name].WaitCh():
case <-time.After(10 * time.Second):
require.Fail(t, "tr.waitCh wasn't closed")
}
}

type allocFailingPrestartHook struct{}

func (*allocFailingPrestartHook) Name() string { return "failing_prestart" }

func (*allocFailingPrestartHook) Prerun() error {
return fmt.Errorf("failing prestart hooks")
}
27 changes: 27 additions & 0 deletions client/allocrunner/taskrunner/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,33 @@ func (tr *TaskRunner) initLabels() {
}
}

// Mark a task as failed and not to run. Aimed to be invoked when alloc runner
// prestart hooks failed.
// Should never be called with Run().
func (tr *TaskRunner) MarkFailedDead(reason string) {
defer close(tr.waitCh)

tr.stateLock.Lock()
if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
//TODO Nomad will be unable to restore this task; try to kill
// it now and fail? In general we prefer to leave running
// tasks running even if the agent encounters an error.
tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart",
"error", err)
}
tr.stateLock.Unlock()

event := structs.NewTaskEvent(structs.TaskSetupFailure).
SetDisplayMessage(reason).
SetFailsTask()
tr.UpdateState(structs.TaskStateDead, event)

// Run the stop hooks in case task was a restored task that failed prestart
if err := tr.stop(); err != nil {
tr.logger.Error("stop failed while marking task dead", "error", err)
}
}

// Run the TaskRunner. Starts the user's task or reattaches to a restored task.
// Run closes WaitCh when it exits. Should be started in a goroutine.
func (tr *TaskRunner) Run() {
Expand Down

0 comments on commit 72f67c5

Please sign in to comment.