Skip to content

Commit

Permalink
Merge pull request #3043 from hashicorp/f-2441-shutdown-delay
Browse files Browse the repository at this point in the history
Add optional shutdown delay to tasks
  • Loading branch information
schmichael committed Aug 17, 2017
2 parents 880c303 + fbe85b5 commit f143501
Show file tree
Hide file tree
Showing 11 changed files with 119 additions and 11 deletions.
1 change: 1 addition & 0 deletions api/tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ type Task struct {
Templates []*Template
DispatchPayload *DispatchPayloadConfig
Leader bool
ShutdownDelay time.Duration `mapstructure:"shutdown_delay"`
}

func (t *Task) Canonicalize(tg *TaskGroup, job *Job) {
Expand Down
7 changes: 7 additions & 0 deletions client/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1171,6 +1171,13 @@ func (r *TaskRunner) run() {
interpTask := interpolateServices(r.envBuilder.Build(), r.task)
r.consul.RemoveTask(r.alloc.ID, interpTask)

// Delay actually killing the task if configured. See #244
if r.task.ShutdownDelay > 0 {
r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q",
r.alloc.ID, r.task.Name, r.task.ShutdownDelay)
<-time.After(r.task.ShutdownDelay)
}

// Store the task event that provides context on the task
// destroy. The Killed event is set from the alloc_runner and
// doesn't add detail
Expand Down
64 changes: 64 additions & 0 deletions client/task_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1614,3 +1614,67 @@ func TestTaskRunner_Pre06ScriptCheck(t *testing.T) {
t.Run(run("0.5.6", "java", "tcp", false))
t.Run(run("0.5.6", "mock_driver", "tcp", false))
}

func TestTaskRunner_ShutdownDelay(t *testing.T) {
t.Parallel()

alloc := mock.Alloc()
task := alloc.Job.TaskGroups[0].Tasks[0]
task.Driver = "mock_driver"
task.Config = map[string]interface{}{
"run_for": "1000s",
}

// No shutdown escape hatch for this delay, so don't set it too high
task.ShutdownDelay = 500 * time.Duration(testutil.TestMultiplier()) * time.Millisecond

ctx := testTaskRunnerFromAlloc(t, true, alloc)
ctx.tr.MarkReceived()
go ctx.tr.Run()
defer ctx.Cleanup()

// Wait for the task to start
testWaitForTaskToStart(t, ctx)

// Begin the tear down
ctx.tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
destroyed := time.Now()

// Service should get removed quickly; loop until RemoveTask is called
found := false
mockConsul := ctx.tr.consul.(*mockConsulServiceClient)
deadline := destroyed.Add(task.ShutdownDelay)
for time.Now().Before(deadline) {
time.Sleep(5 * time.Millisecond)

mockConsul.mu.Lock()
n := len(mockConsul.ops)
if n < 2 {
mockConsul.mu.Unlock()
continue
}

lastOp := mockConsul.ops[n-1].op
mockConsul.mu.Unlock()

if lastOp == "remove" {
found = true
break
}
}
if !found {
t.Errorf("task was not removed from Consul first")
}

// Wait for actual exit
select {
case <-ctx.tr.WaitCh():
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
t.Fatalf("timeout")
}

// It should be impossible to reach here in less time than the shutdown delay
if time.Now().Before(destroyed.Add(task.ShutdownDelay)) {
t.Fatalf("task exited before shutdown delay")
}
}
1 change: 1 addition & 0 deletions command/agent/job_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,7 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) {
structsTask.Env = apiTask.Env
structsTask.Meta = apiTask.Meta
structsTask.KillTimeout = *apiTask.KillTimeout
structsTask.ShutdownDelay = apiTask.ShutdownDelay

if l := len(apiTask.Constraints); l != 0 {
structsTask.Constraints = make([]*structs.Constraint, l)
Expand Down
1 change: 1 addition & 0 deletions jobspec/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,7 @@ func parseTasks(jobName string, taskGroupName string, result *[]*api.Task, list
"meta",
"resources",
"service",
"shutdown_delay",
"template",
"user",
"vault",
Expand Down
3 changes: 2 additions & 1 deletion jobspec/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ func TestParse(t *testing.T) {
},
},
},
KillTimeout: helper.TimeToPtr(22 * time.Second),
KillTimeout: helper.TimeToPtr(22 * time.Second),
ShutdownDelay: 11 * time.Second,
LogConfig: &api.LogConfig{
MaxFiles: helper.IntToPtr(14),
MaxFileSizeMB: helper.IntToPtr(101),
Expand Down
2 changes: 2 additions & 0 deletions jobspec/test-fixtures/basic.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ job "binstore-storagelocker" {

kill_timeout = "22s"

shutdown_delay = "11s"

artifact {
source = "http://foo.com/artifact"

Expand Down
30 changes: 21 additions & 9 deletions nomad/structs/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1922,8 +1922,8 @@ func TestTaskGroupDiff(t *testing.T) {
Driver: "docker",
},
{
Name: "baz",
Driver: "docker",
Name: "baz",
ShutdownDelay: 1 * time.Second,
},
},
},
Expand All @@ -1933,14 +1933,14 @@ func TestTaskGroupDiff(t *testing.T) {
Name: "bar",
Driver: "docker",
},
{
Name: "baz",
Driver: "exec",
},
{
Name: "bam",
Driver: "docker",
},
{
Name: "baz",
ShutdownDelay: 2 * time.Second,
},
},
},
Expected: &TaskGroupDiff{
Expand Down Expand Up @@ -1968,6 +1968,12 @@ func TestTaskGroupDiff(t *testing.T) {
Old: "",
New: "false",
},
{
Type: DiffTypeAdded,
Name: "ShutdownDelay",
Old: "",
New: "0",
},
},
},
{
Expand All @@ -1980,9 +1986,9 @@ func TestTaskGroupDiff(t *testing.T) {
Fields: []*FieldDiff{
{
Type: DiffTypeEdited,
Name: "Driver",
Old: "docker",
New: "exec",
Name: "ShutdownDelay",
Old: "1000000000",
New: "2000000000",
},
},
},
Expand All @@ -2008,6 +2014,12 @@ func TestTaskGroupDiff(t *testing.T) {
Old: "false",
New: "",
},
{
Type: DiffTypeDeleted,
Name: "ShutdownDelay",
Old: "0",
New: "",
},
},
},
},
Expand Down
9 changes: 8 additions & 1 deletion nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -2997,6 +2997,10 @@ type Task struct {
// Leader marks the task as the leader within the group. When the leader
// task exits, other tasks will be gracefully terminated.
Leader bool

// ShutdownDelay is the duration of the delay between deregistering a
// task from Consul and sending it a signal to shutdown. See #2441
ShutdownDelay time.Duration
}

func (t *Task) Copy() *Task {
Expand Down Expand Up @@ -3104,9 +3108,12 @@ func (t *Task) Validate(ephemeralDisk *EphemeralDisk) error {
if t.Driver == "" {
mErr.Errors = append(mErr.Errors, errors.New("Missing task driver"))
}
if t.KillTimeout.Nanoseconds() < 0 {
if t.KillTimeout < 0 {
mErr.Errors = append(mErr.Errors, errors.New("KillTimeout must be a positive value"))
}
if t.ShutdownDelay < 0 {
mErr.Errors = append(mErr.Errors, errors.New("ShutdownDelay must be a positive value"))
}

// Validate the resources.
if t.Resources == nil {
Expand Down
6 changes: 6 additions & 0 deletions website/source/api/json-jobs.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,12 @@ The `Task` object supports the following keys:
- `TLSSkipVerify`: If true, Consul will not attempt to verify the
certificate when performing HTTPS checks. Requires Consul >= 0.7.2.

- `ShutdownDelay` - Specifies the duration to wait when killing a task between
removing it from Consul and sending it a shutdown signal. Ideally services
would fail healthchecks once they receive a shutdown signal. Alternatively
`ShutdownDelay` may be set to give in flight requests time to complete before
shutting down.

- `Templates` - Specifies the set of [`Template`](#template) objects to render for the task.
Templates can be used to inject both static and dynamic configuration with
data populated from environment variables, Consul and Vault.
Expand Down
6 changes: 6 additions & 0 deletions website/source/docs/job-specification/task.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@ job "docs" {
[Consul][] for service discovery. Nomad automatically registers when a task
is started and de-registers it when the task dies.

- `shutdown_delay` `(string: "0s")` - Specifies the duration to wait when
killing a task between removing it from Consul and sending it a shutdown
signal. Ideally services would fail healthchecks once they receive a shutdown
signal. Alternatively `shutdown_delay` may be set to give in flight requests
time to complete before shutting down.

- `user` `(string: <varies>)` - Specifies the user that will run the task.
Defaults to `nobody` for the [`exec`][exec] and [`java`][java] drivers.
[Docker][] and [rkt][] images specify their own default users. This can only
Expand Down

0 comments on commit f143501

Please sign in to comment.