Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optional shutdown delay to tasks #3043

Merged
merged 6 commits into from
Aug 17, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ type Task struct {
Templates []*Template
DispatchPayload *DispatchPayloadConfig
Leader bool
ShutdownDelay time.Duration `mapstructure:"shutdown_delay"`
}

func (t *Task) Canonicalize(tg *TaskGroup, job *Job) {
Expand Down
7 changes: 7 additions & 0 deletions client/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1171,6 +1171,13 @@ func (r *TaskRunner) run() {
interpTask := interpolateServices(r.envBuilder.Build(), r.task)
r.consul.RemoveTask(r.alloc.ID, interpTask)

// Delay actually killing the task if configured. See #244
if r.task.ShutdownDelay > 0 {
r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q",
r.alloc.ID, r.task.Name, r.task.ShutdownDelay)
<-time.After(r.task.ShutdownDelay)
}

// Store the task event that provides context on the task
// destroy. The Killed event is set from the alloc_runner and
// doesn't add detail
Expand Down
64 changes: 64 additions & 0 deletions client/task_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1614,3 +1614,67 @@ func TestTaskRunner_Pre06ScriptCheck(t *testing.T) {
t.Run(run("0.5.6", "java", "tcp", false))
t.Run(run("0.5.6", "mock_driver", "tcp", false))
}

func TestTaskRunner_ShutdownDelay(t *testing.T) {
t.Parallel()

alloc := mock.Alloc()
task := alloc.Job.TaskGroups[0].Tasks[0]
task.Driver = "mock_driver"
task.Config = map[string]interface{}{
"run_for": "1000s",
}

// No shutdown escape hatch for this delay, so don't set it too high
task.ShutdownDelay = 500 * time.Duration(testutil.TestMultiplier()) * time.Millisecond

ctx := testTaskRunnerFromAlloc(t, true, alloc)
ctx.tr.MarkReceived()
go ctx.tr.Run()
defer ctx.Cleanup()

// Wait for the task to start
testWaitForTaskToStart(t, ctx)

// Begin the tear down
ctx.tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
destroyed := time.Now()

// Service should get removed quickly; loop until RemoveTask is called
found := false
mockConsul := ctx.tr.consul.(*mockConsulServiceClient)
deadline := destroyed.Add(task.ShutdownDelay)
for time.Now().Before(deadline) {
time.Sleep(5 * time.Millisecond)

mockConsul.mu.Lock()
n := len(mockConsul.ops)
if n < 2 {
mockConsul.mu.Unlock()
continue
}

lastOp := mockConsul.ops[n-1].op
mockConsul.mu.Unlock()

if lastOp == "remove" {
found = true
break
}
}
if !found {
t.Errorf("task was not removed from Consul first")
}

// Wait for actual exit
select {
case <-ctx.tr.WaitCh():
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
t.Fatalf("timeout")
}

// It should be impossible to reach here in less time than the shutdown delay
if time.Now().Before(destroyed.Add(task.ShutdownDelay)) {
t.Fatalf("task exited before shutdown delay")
}
}
1 change: 1 addition & 0 deletions command/agent/job_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,7 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) {
structsTask.Env = apiTask.Env
structsTask.Meta = apiTask.Meta
structsTask.KillTimeout = *apiTask.KillTimeout
structsTask.ShutdownDelay = apiTask.ShutdownDelay

if l := len(apiTask.Constraints); l != 0 {
structsTask.Constraints = make([]*structs.Constraint, l)
Expand Down
1 change: 1 addition & 0 deletions jobspec/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,7 @@ func parseTasks(jobName string, taskGroupName string, result *[]*api.Task, list
"meta",
"resources",
"service",
"shutdown_delay",
"template",
"user",
"vault",
Expand Down
3 changes: 2 additions & 1 deletion jobspec/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ func TestParse(t *testing.T) {
},
},
},
KillTimeout: helper.TimeToPtr(22 * time.Second),
KillTimeout: helper.TimeToPtr(22 * time.Second),
ShutdownDelay: 11 * time.Second,
LogConfig: &api.LogConfig{
MaxFiles: helper.IntToPtr(14),
MaxFileSizeMB: helper.IntToPtr(101),
Expand Down
2 changes: 2 additions & 0 deletions jobspec/test-fixtures/basic.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ job "binstore-storagelocker" {

kill_timeout = "22s"

shutdown_delay = "11s"

artifact {
source = "http://foo.com/artifact"

Expand Down
30 changes: 21 additions & 9 deletions nomad/structs/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1922,8 +1922,8 @@ func TestTaskGroupDiff(t *testing.T) {
Driver: "docker",
},
{
Name: "baz",
Driver: "docker",
Name: "baz",
ShutdownDelay: 1 * time.Second,
},
},
},
Expand All @@ -1933,14 +1933,14 @@ func TestTaskGroupDiff(t *testing.T) {
Name: "bar",
Driver: "docker",
},
{
Name: "baz",
Driver: "exec",
},
{
Name: "bam",
Driver: "docker",
},
{
Name: "baz",
ShutdownDelay: 2 * time.Second,
},
},
},
Expected: &TaskGroupDiff{
Expand Down Expand Up @@ -1968,6 +1968,12 @@ func TestTaskGroupDiff(t *testing.T) {
Old: "",
New: "false",
},
{
Type: DiffTypeAdded,
Name: "ShutdownDelay",
Old: "",
New: "0",
},
},
},
{
Expand All @@ -1980,9 +1986,9 @@ func TestTaskGroupDiff(t *testing.T) {
Fields: []*FieldDiff{
{
Type: DiffTypeEdited,
Name: "Driver",
Old: "docker",
New: "exec",
Name: "ShutdownDelay",
Old: "1000000000",
New: "2000000000",
},
},
},
Expand All @@ -2008,6 +2014,12 @@ func TestTaskGroupDiff(t *testing.T) {
Old: "false",
New: "",
},
{
Type: DiffTypeDeleted,
Name: "ShutdownDelay",
Old: "0",
New: "",
},
},
},
},
Expand Down
9 changes: 8 additions & 1 deletion nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -2997,6 +2997,10 @@ type Task struct {
// Leader marks the task as the leader within the group. When the leader
// task exits, other tasks will be gracefully terminated.
Leader bool

// ShutdownDelay is the duration of the delay between deregistering a
// task from Consul and sending it a signal to shutdown. See #2441
ShutdownDelay time.Duration
}

func (t *Task) Copy() *Task {
Expand Down Expand Up @@ -3104,9 +3108,12 @@ func (t *Task) Validate(ephemeralDisk *EphemeralDisk) error {
if t.Driver == "" {
mErr.Errors = append(mErr.Errors, errors.New("Missing task driver"))
}
if t.KillTimeout.Nanoseconds() < 0 {
if t.KillTimeout < 0 {
mErr.Errors = append(mErr.Errors, errors.New("KillTimeout must be a positive value"))
}
if t.ShutdownDelay < 0 {
mErr.Errors = append(mErr.Errors, errors.New("ShutdownDelay must be a positive value"))
}

// Validate the resources.
if t.Resources == nil {
Expand Down
6 changes: 6 additions & 0 deletions website/source/api/json-jobs.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,12 @@ The `Task` object supports the following keys:
- `TLSSkipVerify`: If true, Consul will not attempt to verify the
certificate when performing HTTPS checks. Requires Consul >= 0.7.2.

- `ShutdownDelay` - Specifies the duration to wait when killing a task between
removing it from Consul and sending it a shutdown signal. Ideally services
would fail healthchecks once they receive a shutdown signal. Alternatively
`ShutdownDelay` may be set to give in flight requests time to complete before
shutting down.

- `Templates` - Specifies the set of [`Template`](#template) objects to render for the task.
Templates can be used to inject both static and dynamic configuration with
data populated from environment variables, Consul and Vault.
Expand Down
6 changes: 6 additions & 0 deletions website/source/docs/job-specification/task.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@ job "docs" {
[Consul][] for service discovery. Nomad automatically registers when a task
is started and de-registers it when the task dies.

- `shutdown_delay` `(string: "0s")` - Specifies the duration to wait when
killing a task between removing it from Consul and sending it a shutdown
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

JSON job spec as well

signal. Ideally services would fail healthchecks once they receive a shutdown
signal. Alternatively `shutdown_delay` may be set to give in flight requests
time to complete before shutting down.

- `user` `(string: <varies>)` - Specifies the user that will run the task.
Defaults to `nobody` for the [`exec`][exec] and [`java`][java] drivers.
[Docker][] and [rkt][] images specify their own default users. This can only
Expand Down