Merge pull request #744 from hashicorp/f-on-restart
Restart on-success shouldn't be user specifiable
dadgar committed Feb 3, 2016
2 parents 6281ff5 + 2b43ddf commit 2f4bb31
Showing 13 changed files with 56 additions and 71 deletions.
9 changes: 4 additions & 5 deletions api/tasks.go
@@ -7,11 +7,10 @@ import (
// RestartPolicy defines how the Nomad client restarts
// tasks in a taskgroup when they fail
type RestartPolicy struct {
-Interval time.Duration
-Attempts int
-Delay time.Duration
-RestartOnSuccess bool
-Mode string
+Interval time.Duration
+Attempts int
+Delay time.Duration
+Mode string
}

// The ServiceCheck data model represents the consul health check that
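
For orientation, a minimal sketch of what the trimmed `api.RestartPolicy` looks like in use after this change; the field names come from the diff above, while the values are illustrative rather than defaults taken from the source:

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Restart-on-success is no longer a field here; the client now derives it
	// from the job type instead of from the policy.
	rp := &api.RestartPolicy{
		Interval: 10 * time.Minute,
		Attempts: 3,
		Delay:    15 * time.Second,
		Mode:     "delay",
	}
	fmt.Printf("%+v\n", rp)
}
```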
4 changes: 2 additions & 2 deletions client/alloc_runner.go
@@ -115,7 +115,7 @@ func (r *AllocRunner) RestoreState() error {
r.restored[name] = struct{}{}

task := &structs.Task{Name: name}
-restartTracker := newRestartTracker(r.RestartPolicy)
+restartTracker := newRestartTracker(r.RestartPolicy, r.alloc.Job.Type)
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx,
r.alloc, task, restartTracker, r.consulService)
r.tasks[name] = tr
@@ -370,7 +370,7 @@ func (r *AllocRunner) Run() {

// Merge in the task resources
task.Resources = alloc.TaskResources[task.Name]
-restartTracker := newRestartTracker(r.RestartPolicy)
+restartTracker := newRestartTracker(r.RestartPolicy, r.alloc.Job.Type)
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx,
r.alloc, task, restartTracker, r.consulService)
r.tasks[task.Name] = tr
3 changes: 2 additions & 1 deletion client/alloc_runner_test.go
@@ -33,7 +33,8 @@ func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
alloc := mock.Alloc()
consulClient, _ := NewConsulService(&consulServiceConfig{logger, "127.0.0.1:8500", "", "", false, false, &structs.Node{}})
if !restarts {
-*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0, RestartOnSuccess: false}
+*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
+alloc.Job.Type = structs.JobTypeBatch
}

ar := NewAllocRunner(logger, conf, upd.Update, alloc, consulClient)
14 changes: 10 additions & 4 deletions client/restarts.go
@@ -10,16 +10,22 @@ import (
// jitter is the percent of jitter added to restart delays.
const jitter = 0.25

-func newRestartTracker(policy *structs.RestartPolicy) *RestartTracker {
+func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker {
+onSuccess := true
+if jobType == structs.JobTypeBatch {
+onSuccess = false
+}
return &RestartTracker{
startTime: time.Now(),
+onSuccess: onSuccess,
policy: policy,
rand: rand.New(rand.NewSource(time.Now().Unix())),
}
}

type RestartTracker struct {
count int // Current number of attempts.
+onSuccess bool // Whether to restart on successful exit code.
startTime time.Time // When the interval began
policy *structs.RestartPolicy
rand *rand.Rand
@@ -57,9 +63,9 @@ func (r *RestartTracker) NextRestart(exitCode int) (bool, time.Duration) {
}

// shouldRestart returns whether a restart should occur based on the exit code
-// and the RestartOnSuccess configuration.
+// and job type.
func (r *RestartTracker) shouldRestart(exitCode int) bool {
-return exitCode != 0 || r.policy.RestartOnSuccess
+return exitCode != 0 || r.onSuccess
}

// jitter returns the delay time plus a jitter.
@@ -77,5 +83,5 @@ func (r *RestartTracker) jitter() time.Duration {
// Returns a tracker that never restarts.
func noRestartsTracker() *RestartTracker {
policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail}
-return newRestartTracker(policy)
+return newRestartTracker(policy, structs.JobTypeBatch)
}
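
The behavioral core of the change is above: restart-on-success is now derived from the job type rather than a policy field. A sketch of how that plays out, written in the style of the client package's tests — the policy values are made up, while `newRestartTracker`, `NextRestart`, and the constants are the ones shown in the diff:

```go
package client

import (
	"testing"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

// Sketch: a service job's task restarts after exit code 0, a batch job's does not.
func TestRestartTracker_OnSuccessByJobType(t *testing.T) {
	p := &structs.RestartPolicy{
		Attempts: 3,
		Interval: 10 * time.Minute,
		Delay:    15 * time.Second,
		Mode:     structs.RestartPolicyModeDelay,
	}

	if restart, _ := newRestartTracker(p, structs.JobTypeService).NextRestart(0); !restart {
		t.Fatalf("service task should restart on exit code 0")
	}
	if restart, _ := newRestartTracker(p, structs.JobTypeBatch).NextRestart(0); restart {
		t.Fatalf("batch task should not restart on exit code 0")
	}
}
```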
17 changes: 8 additions & 9 deletions client/restarts_test.go
@@ -9,11 +9,10 @@ import (

func testPolicy(success bool, mode string) *structs.RestartPolicy {
return &structs.RestartPolicy{
-Interval: 2 * time.Minute,
-Delay: 1 * time.Second,
-Attempts: 3,
-Mode: mode,
-RestartOnSuccess: success,
+Interval: 2 * time.Minute,
+Delay: 1 * time.Second,
+Attempts: 3,
+Mode: mode,
}
}

@@ -27,7 +26,7 @@ func withinJitter(expected, actual time.Duration) bool {
func TestClient_RestartTracker_ModeDelay(t *testing.T) {
t.Parallel()
p := testPolicy(true, structs.RestartPolicyModeDelay)
-rt := newRestartTracker(p)
+rt := newRestartTracker(p, structs.JobTypeService)
for i := 0; i < p.Attempts; i++ {
actual, when := rt.NextRestart(127)
if !actual {
@@ -53,7 +52,7 @@ func TestClient_RestartTracker_ModeDelay(t *testing.T) {
func TestClient_RestartTracker_ModeFail(t *testing.T) {
t.Parallel()
p := testPolicy(true, structs.RestartPolicyModeFail)
-rt := newRestartTracker(p)
+rt := newRestartTracker(p, structs.JobTypeSystem)
for i := 0; i < p.Attempts; i++ {
actual, when := rt.NextRestart(127)
if !actual {
@@ -73,7 +72,7 @@ func TestClient_RestartTracker_ModeFail(t *testing.T) {
func TestClient_RestartTracker_NoRestartOnSuccess(t *testing.T) {
t.Parallel()
p := testPolicy(false, structs.RestartPolicyModeDelay)
-rt := newRestartTracker(p)
+rt := newRestartTracker(p, structs.JobTypeBatch)
if shouldRestart, _ := rt.NextRestart(0); shouldRestart {
t.Fatalf("NextRestart() returned %v, expected: %v", shouldRestart, false)
}
@@ -83,7 +82,7 @@ func TestClient_RestartTracker_ZeroAttempts(t *testing.T) {
t.Parallel()
p := testPolicy(true, structs.RestartPolicyModeFail)
p.Attempts = 0
-rt := newRestartTracker(p)
+rt := newRestartTracker(p, structs.JobTypeService)
if actual, when := rt.NextRestart(1); actual {
t.Fatalf("expect no restart, got restart/delay: %v", when)
}
2 changes: 1 addition & 1 deletion client/task_runner_test.go
@@ -49,7 +49,7 @@ func testTaskRunner(restarts bool) (*MockTaskStateUpdater, *TaskRunner) {

ctx := driver.NewExecContext(allocDir, alloc.ID)
rp := structs.NewRestartPolicy(structs.JobTypeService)
-restartTracker := newRestartTracker(rp)
+restartTracker := newRestartTracker(rp, alloc.Job.Type)
if !restarts {
restartTracker = noRestartsTracker()
}
3 changes: 0 additions & 3 deletions command/init.go
@@ -114,9 +114,6 @@ job "example" {
# A delay between a task failing and a restart occuring.
delay = "25s"
-# Whether the tasks should be restarted if the exit successfully.
-on_success = true
# Mode controls what happens when a task has restarted "attempts"
# times within the interval. "delay" mode delays the next restart
# till the next interval. "fail" mode does not restart the task if
11 changes: 5 additions & 6 deletions jobspec/parse_test.go
@@ -78,11 +78,10 @@ func TestParse(t *testing.T) {
"elb_checks": "3",
},
RestartPolicy: &structs.RestartPolicy{
-Interval: 10 * time.Minute,
-Attempts: 5,
-Delay: 15 * time.Second,
-RestartOnSuccess: true,
-Mode: "delay",
+Interval: 10 * time.Minute,
+Attempts: 5,
+Delay: 15 * time.Second,
+Mode: "delay",
},
Tasks: []*structs.Task{
&structs.Task{
@@ -114,7 +113,7 @@ func TestParse(t *testing.T) {
CPU: 500,
MemoryMB: 128,
DiskMB: 10,
-IOPS: 1,
+IOPS: 0,
Networks: []*structs.NetworkResource{
&structs.NetworkResource{
MBits: 100,
1 change: 0 additions & 1 deletion jobspec/test-fixtures/basic.hcl
@@ -35,7 +35,6 @@ job "binstore-storagelocker" {
attempts = 5
interval = "10m"
delay = "15s"
-on_success = true
mode = "delay"
}
task "binstore" {
18 changes: 8 additions & 10 deletions nomad/mock/mock.go
@@ -77,11 +77,10 @@ func Job() *structs.Job {
Name: "web",
Count: 10,
RestartPolicy: &structs.RestartPolicy{
-Attempts: 3,
-Interval: 10 * time.Minute,
-Delay: 1 * time.Minute,
-RestartOnSuccess: true,
-Mode: structs.RestartPolicyModeDelay,
+Attempts: 3,
+Interval: 10 * time.Minute,
+Delay: 1 * time.Minute,
+Mode: structs.RestartPolicyModeDelay,
},
Tasks: []*structs.Task{
&structs.Task{
@@ -156,11 +155,10 @@ func SystemJob() *structs.Job {
Name: "web",
Count: 1,
RestartPolicy: &structs.RestartPolicy{
-Attempts: 3,
-Interval: 10 * time.Minute,
-Delay: 1 * time.Minute,
-RestartOnSuccess: true,
-Mode: structs.RestartPolicyModeDelay,
+Attempts: 3,
+Interval: 10 * time.Minute,
+Delay: 1 * time.Minute,
+Mode: structs.RestartPolicyModeDelay,
},
Tasks: []*structs.Task{
&structs.Task{
22 changes: 8 additions & 14 deletions nomad/structs/structs.go
@@ -1146,18 +1146,16 @@ type PeriodicLaunch struct {

var (
defaultServiceJobRestartPolicy = RestartPolicy{
-Delay: 15 * time.Second,
-Attempts: 2,
-Interval: 1 * time.Minute,
-RestartOnSuccess: true,
-Mode: RestartPolicyModeDelay,
+Delay: 15 * time.Second,
+Attempts: 2,
+Interval: 1 * time.Minute,
+Mode: RestartPolicyModeDelay,
}
defaultBatchJobRestartPolicy = RestartPolicy{
-Delay: 15 * time.Second,
-Attempts: 15,
-Interval: 7 * 24 * time.Hour,
-RestartOnSuccess: false,
-Mode: RestartPolicyModeDelay,
+Delay: 15 * time.Second,
+Attempts: 15,
+Interval: 7 * 24 * time.Hour,
+Mode: RestartPolicyModeDelay,
}
)

@@ -1183,10 +1181,6 @@ type RestartPolicy struct {
// Delay is the time between a failure and a restart.
Delay time.Duration

-// RestartOnSuccess determines whether a task should be restarted if it
-// exited successfully.
-RestartOnSuccess bool `mapstructure:"on_success"`

// Mode controls what happens when the task restarts more than attempt times
// in an interval.
Mode string
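
The task runner test above calls `structs.NewRestartPolicy(structs.JobTypeService)`, which presumably hands back a copy of one of these defaults keyed by job type. A rough sketch of that shape, offered as an assumption rather than the actual implementation:

```go
// Sketch only: an assumed shape for selecting the defaults above by job type,
// as it would live inside nomad/structs. The real NewRestartPolicy may differ.
func NewRestartPolicy(jobType string) *RestartPolicy {
	switch jobType {
	case JobTypeService, JobTypeSystem:
		rp := defaultServiceJobRestartPolicy // copy so callers can't mutate the default
		return &rp
	case JobTypeBatch:
		rp := defaultBatchJobRestartPolicy
		return &rp
	}
	return nil
}
```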
18 changes: 8 additions & 10 deletions nomad/structs/structs_test.go
@@ -189,11 +189,10 @@ func TestJob_IsPeriodic(t *testing.T) {
func TestTaskGroup_Validate(t *testing.T) {
tg := &TaskGroup{
RestartPolicy: &RestartPolicy{
-Interval: 5 * time.Minute,
-Delay: 10 * time.Second,
-Attempts: 10,
-RestartOnSuccess: true,
-Mode: RestartPolicyModeDelay,
+Interval: 5 * time.Minute,
+Delay: 10 * time.Second,
+Attempts: 10,
+Mode: RestartPolicyModeDelay,
},
}
err := tg.Validate()
@@ -217,11 +216,10 @@ func TestTaskGroup_Validate(t *testing.T) {
&Task{},
},
RestartPolicy: &RestartPolicy{
-Interval: 5 * time.Minute,
-Delay: 10 * time.Second,
-Attempts: 10,
-RestartOnSuccess: true,
-Mode: RestartPolicyModeDelay,
+Interval: 5 * time.Minute,
+Delay: 10 * time.Second,
+Attempts: 10,
+Mode: RestartPolicyModeDelay,
},
}
err = tg.Validate()
5 changes: 0 additions & 5 deletions website/source/docs/jobspec/index.html.md
@@ -309,9 +309,6 @@ The `restart` object supports the following keys:
time duration using the `s`, `m`, and `h` suffixes, such as `30s`. A random
jitter of up to 25% is added to the the delay.

-* `on_success` - `on_success` controls whether a task is restarted when the
-task exits successfully.

* `mode` - Controls the behavior when the task fails more than `attempts`
times in an interval. Possible values are listed below:

@@ -327,7 +324,6 @@ restart {
attempts = 15
delay = "15s"
interval = "168h" # 7 days
-on_success = false
mode = "delay"
}
```
@@ -339,7 +335,6 @@ restart {
interval = "1m"
attempts = 2
delay = "15s"
-on_success = true
mode = "delay"
}
```
