Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Explain restart decision and display in alloc-status #984

Merged
merged 1 commit into from
Mar 25, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion api/tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ const (
TaskTerminated = "Terminated"
TaskKilled = "Killed"
TaskRestarting = "Restarting"
TaskNotRestarting = "Restarts Exceeded"
TaskNotRestarting = "Not Restarting"
TaskDownloadingArtifacts = "Downloading Artifacts"
TaskArtifactDownloadFailed = "Failed Artifact Download"
)
Expand All @@ -173,6 +173,7 @@ const (
type TaskEvent struct {
Type string
Time int64
RestartReason string
DriverError string
ExitCode int
Signal int
Expand Down
32 changes: 30 additions & 2 deletions client/restarts.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package client

import (
"fmt"
"math/rand"
"sync"
"time"
Expand All @@ -9,8 +10,15 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
)

// jitter is the percent of jitter added to restart delays.
const jitter = 0.25
const (
// jitter is the percent of jitter added to restart delays.
jitter = 0.25

ReasonNoRestartsAllowed = "Policy allows no restarts"
ReasonUnrecoverableErrror = "Error was unrecoverable"
ReasonWithinPolicy = "Restart within policy"
ReasonDelay = "Exceeded allowed attempts, applying a delay"
)

func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker {
onSuccess := true
Expand All @@ -31,6 +39,7 @@ type RestartTracker struct {
count int // Current number of attempts.
onSuccess bool // Whether to restart on successful exit code.
startTime time.Time // When the interval began
reason string // The reason for the last state
policy *structs.RestartPolicy
rand *rand.Rand
lock sync.Mutex
Expand Down Expand Up @@ -60,6 +69,14 @@ func (r *RestartTracker) SetWaitResult(res *cstructs.WaitResult) *RestartTracker
return r
}

// GetReason returns a human-readable description for the last state returned by
// GetState.
func (r *RestartTracker) GetReason() string {
r.lock.Lock()
defer r.lock.Unlock()
return r.reason
}

// GetState returns the tasks next state given the set exit code and start
// error. One of the following states are returned:
// * TaskRestarting - Task should be restarted
Expand All @@ -76,6 +93,7 @@ func (r *RestartTracker) GetState() (string, time.Duration) {

// Hot path if no attempts are expected
if r.policy.Attempts == 0 {
r.reason = ReasonNoRestartsAllowed
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't you need to acquire a lock before setting the reason.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is locked at the top of GetState

if r.waitRes != nil && r.waitRes.Successful() {
return structs.TaskTerminated, 0
}
Expand Down Expand Up @@ -109,13 +127,17 @@ func (r *RestartTracker) GetState() (string, time.Duration) {
func (r *RestartTracker) handleStartError() (string, time.Duration) {
// If the error is not recoverable, do not restart.
if rerr, ok := r.startErr.(*cstructs.RecoverableError); !(ok && rerr.Recoverable) {
r.reason = ReasonUnrecoverableErrror
return structs.TaskNotRestarting, 0
}

if r.count > r.policy.Attempts {
r.reason = fmt.Sprintf("Exceeded allowed attempts %d in interval %v",
r.policy.Attempts, r.policy.Interval)
return structs.TaskNotRestarting, 0
}

r.reason = ReasonWithinPolicy
return structs.TaskRestarting, r.jitter()
}

Expand All @@ -125,17 +147,23 @@ func (r *RestartTracker) handleWaitResult() (string, time.Duration) {
// If the task started successfully and restart on success isn't specified,
// don't restart but don't mark as failed.
if r.waitRes.Successful() && !r.onSuccess {
r.reason = "Restart unnecessary as task terminated successfully"
return structs.TaskTerminated, 0
}

if r.count > r.policy.Attempts {
if r.policy.Mode == structs.RestartPolicyModeFail {
r.reason = fmt.Sprintf(
`Exceeded allowed atttempts %d in interval %v and mode is "fail"`,
r.policy.Attempts, r.policy.Interval)
return structs.TaskNotRestarting, 0
} else {
r.reason = ReasonDelay
return structs.TaskRestarting, r.getDelay()
}
}

r.reason = ReasonWithinPolicy
return structs.TaskRestarting, r.jitter()
}

Expand Down
10 changes: 8 additions & 2 deletions client/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -345,16 +345,22 @@ func (r *TaskRunner) run() {
RESTART:
state, when := r.restartTracker.GetState()
r.restartTracker.SetStartError(nil).SetWaitResult(nil)
reason := r.restartTracker.GetReason()
switch state {
case structs.TaskNotRestarting, structs.TaskTerminated:
r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
if state == structs.TaskNotRestarting {
r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting))
r.setState(structs.TaskStateDead,
structs.NewTaskEvent(structs.TaskNotRestarting).
SetRestartReason(reason))
}
return
case structs.TaskRestarting:
r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when))
r.setState(structs.TaskStatePending,
structs.NewTaskEvent(structs.TaskRestarting).
SetRestartDelay(when).
SetRestartReason(reason))
default:
r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
return
Expand Down
14 changes: 12 additions & 2 deletions command/alloc_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"time"

"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/client"
)

type AllocStatusCommand struct {
Expand Down Expand Up @@ -240,9 +241,18 @@ func (c *AllocStatusCommand) taskStatus(alloc *api.Allocation) {
}
desc = strings.Join(parts, ", ")
case api.TaskRestarting:
desc = fmt.Sprintf("Task restarting in %v", time.Duration(event.StartDelay))
in := fmt.Sprintf("Task restarting in %v", time.Duration(event.StartDelay))
if event.RestartReason != "" && event.RestartReason != client.ReasonWithinPolicy {
desc = fmt.Sprintf("%s - %s", event.RestartReason, in)
} else {
desc = in
}
case api.TaskNotRestarting:
desc = "Task exceeded restart policy"
if event.RestartReason != "" {
desc = event.RestartReason
} else {
desc = "Task exceeded restart policy"
}
}

// Reverse order so we are sorted by time
Expand Down
10 changes: 9 additions & 1 deletion nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -1830,7 +1830,7 @@ const (

// TaskNotRestarting indicates that the task has failed and is not being
// restarted because it has exceeded its restart policy.
TaskNotRestarting = "Restarts Exceeded"
TaskNotRestarting = "Not Restarting"

// TaskDownloadingArtifacts means the task is downloading the artifacts
// specified in the task.
Expand All @@ -1847,6 +1847,9 @@ type TaskEvent struct {
Type string
Time int64 // Unix Nanosecond timestamp

// Restart fields.
RestartReason string

// Driver Failure fields.
DriverError string // A driver error occured while starting the task.

Expand Down Expand Up @@ -1924,6 +1927,11 @@ func (e *TaskEvent) SetRestartDelay(delay time.Duration) *TaskEvent {
return e
}

func (e *TaskEvent) SetRestartReason(reason string) *TaskEvent {
e.RestartReason = reason
return e
}

func (e *TaskEvent) SetDownloadError(err error) *TaskEvent {
if err != nil {
e.DownloadError = err.Error()
Expand Down