Skip to content

Commit

Permalink
Merge pull request #984 from hashicorp/f-unrecoverable-error
Browse files Browse the repository at this point in the history
Explain restart decision and display in alloc-status
  • Loading branch information
dadgar committed Mar 25, 2016
2 parents aaff02c + 6e779b8 commit 89c9fd1
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 8 deletions.
3 changes: 2 additions & 1 deletion api/tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ const (
TaskTerminated = "Terminated"
TaskKilled = "Killed"
TaskRestarting = "Restarting"
TaskNotRestarting = "Restarts Exceeded"
TaskNotRestarting = "Not Restarting"
TaskDownloadingArtifacts = "Downloading Artifacts"
TaskArtifactDownloadFailed = "Failed Artifact Download"
)
Expand All @@ -173,6 +173,7 @@ const (
type TaskEvent struct {
Type string
Time int64
RestartReason string
DriverError string
ExitCode int
Signal int
Expand Down
32 changes: 30 additions & 2 deletions client/restarts.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package client

import (
"fmt"
"math/rand"
"sync"
"time"
Expand All @@ -9,8 +10,15 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
)

// jitter is the percent of jitter added to restart delays.
const jitter = 0.25
const (
// jitter is the percent of jitter added to restart delays.
jitter = 0.25

ReasonNoRestartsAllowed = "Policy allows no restarts"
ReasonUnrecoverableErrror = "Error was unrecoverable"
ReasonWithinPolicy = "Restart within policy"
ReasonDelay = "Exceeded allowed attempts, applying a delay"
)

func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker {
onSuccess := true
Expand All @@ -31,6 +39,7 @@ type RestartTracker struct {
count int // Current number of attempts.
onSuccess bool // Whether to restart on successful exit code.
startTime time.Time // When the interval began
reason string // The reason for the last state
policy *structs.RestartPolicy
rand *rand.Rand
lock sync.Mutex
Expand Down Expand Up @@ -60,6 +69,14 @@ func (r *RestartTracker) SetWaitResult(res *cstructs.WaitResult) *RestartTracker
return r
}

// GetReason returns a human-readable description for the last state returned by
// GetState.
func (r *RestartTracker) GetReason() string {
r.lock.Lock()
defer r.lock.Unlock()
return r.reason
}

// GetState returns the tasks next state given the set exit code and start
// error. One of the following states are returned:
// * TaskRestarting - Task should be restarted
Expand All @@ -76,6 +93,7 @@ func (r *RestartTracker) GetState() (string, time.Duration) {

// Hot path if no attempts are expected
if r.policy.Attempts == 0 {
r.reason = ReasonNoRestartsAllowed
if r.waitRes != nil && r.waitRes.Successful() {
return structs.TaskTerminated, 0
}
Expand Down Expand Up @@ -109,13 +127,17 @@ func (r *RestartTracker) GetState() (string, time.Duration) {
func (r *RestartTracker) handleStartError() (string, time.Duration) {
// If the error is not recoverable, do not restart.
if rerr, ok := r.startErr.(*cstructs.RecoverableError); !(ok && rerr.Recoverable) {
r.reason = ReasonUnrecoverableErrror
return structs.TaskNotRestarting, 0
}

if r.count > r.policy.Attempts {
r.reason = fmt.Sprintf("Exceeded allowed attempts %d in interval %v",
r.policy.Attempts, r.policy.Interval)
return structs.TaskNotRestarting, 0
}

r.reason = ReasonWithinPolicy
return structs.TaskRestarting, r.jitter()
}

Expand All @@ -125,17 +147,23 @@ func (r *RestartTracker) handleWaitResult() (string, time.Duration) {
// If the task started successfully and restart on success isn't specified,
// don't restart but don't mark as failed.
if r.waitRes.Successful() && !r.onSuccess {
r.reason = "Restart unnecessary as task terminated successfully"
return structs.TaskTerminated, 0
}

if r.count > r.policy.Attempts {
if r.policy.Mode == structs.RestartPolicyModeFail {
r.reason = fmt.Sprintf(
`Exceeded allowed atttempts %d in interval %v and mode is "fail"`,
r.policy.Attempts, r.policy.Interval)
return structs.TaskNotRestarting, 0
} else {
r.reason = ReasonDelay
return structs.TaskRestarting, r.getDelay()
}
}

r.reason = ReasonWithinPolicy
return structs.TaskRestarting, r.jitter()
}

Expand Down
10 changes: 8 additions & 2 deletions client/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -345,16 +345,22 @@ func (r *TaskRunner) run() {
RESTART:
state, when := r.restartTracker.GetState()
r.restartTracker.SetStartError(nil).SetWaitResult(nil)
reason := r.restartTracker.GetReason()
switch state {
case structs.TaskNotRestarting, structs.TaskTerminated:
r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
if state == structs.TaskNotRestarting {
r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting))
r.setState(structs.TaskStateDead,
structs.NewTaskEvent(structs.TaskNotRestarting).
SetRestartReason(reason))
}
return
case structs.TaskRestarting:
r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when))
r.setState(structs.TaskStatePending,
structs.NewTaskEvent(structs.TaskRestarting).
SetRestartDelay(when).
SetRestartReason(reason))
default:
r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
return
Expand Down
14 changes: 12 additions & 2 deletions command/alloc_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"time"

"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/client"
)

type AllocStatusCommand struct {
Expand Down Expand Up @@ -240,9 +241,18 @@ func (c *AllocStatusCommand) taskStatus(alloc *api.Allocation) {
}
desc = strings.Join(parts, ", ")
case api.TaskRestarting:
desc = fmt.Sprintf("Task restarting in %v", time.Duration(event.StartDelay))
in := fmt.Sprintf("Task restarting in %v", time.Duration(event.StartDelay))
if event.RestartReason != "" && event.RestartReason != client.ReasonWithinPolicy {
desc = fmt.Sprintf("%s - %s", event.RestartReason, in)
} else {
desc = in
}
case api.TaskNotRestarting:
desc = "Task exceeded restart policy"
if event.RestartReason != "" {
desc = event.RestartReason
} else {
desc = "Task exceeded restart policy"
}
}

// Reverse order so we are sorted by time
Expand Down
10 changes: 9 additions & 1 deletion nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -1830,7 +1830,7 @@ const (

// TaskNotRestarting indicates that the task has failed and is not being
// restarted because it has exceeded its restart policy.
TaskNotRestarting = "Restarts Exceeded"
TaskNotRestarting = "Not Restarting"

// TaskDownloadingArtifacts means the task is downloading the artifacts
// specified in the task.
Expand All @@ -1847,6 +1847,9 @@ type TaskEvent struct {
Type string
Time int64 // Unix Nanosecond timestamp

// Restart fields.
RestartReason string

// Driver Failure fields.
DriverError string // A driver error occured while starting the task.

Expand Down Expand Up @@ -1924,6 +1927,11 @@ func (e *TaskEvent) SetRestartDelay(delay time.Duration) *TaskEvent {
return e
}

func (e *TaskEvent) SetRestartReason(reason string) *TaskEvent {
e.RestartReason = reason
return e
}

func (e *TaskEvent) SetDownloadError(err error) *TaskEvent {
if err != nil {
e.DownloadError = err.Error()
Expand Down

0 comments on commit 89c9fd1

Please sign in to comment.