Skip to content

Commit

Permalink
Merge pull request #13626 from hashicorp/b-client-max-kill-timeout
Browse files Browse the repository at this point in the history
client: enforce max_kill_timeout client configuration
  • Loading branch information
shoenig committed Jul 7, 2022
2 parents d9a0d64 + dbcccc7 commit 7f11f88
Show file tree
Hide file tree
Showing 9 changed files with 30 additions and 14 deletions.
3 changes: 3 additions & 0 deletions .changelog/13626.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
client: Fixed a bug where max_kill_timeout client config was ignored
```
6 changes: 3 additions & 3 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
}

taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
Expand All @@ -643,7 +643,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
go func(name string, tr *taskrunner.TaskRunner) {
defer wg.Done()
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping task", "error", err, "task_name", name)
Expand All @@ -667,7 +667,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
go func(name string, tr *taskrunner.TaskRunner) {
defer wg.Done()
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name)
Expand Down
2 changes: 1 addition & 1 deletion client/allocrunner/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {

expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing"
if killingMsg != expectedKillingMsg {
return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", killingMsg, expectedKillingMsg)
return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg)
}

// Task Two should be dead
Expand Down
10 changes: 8 additions & 2 deletions client/allocrunner/taskrunner/driver_handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,24 @@ import (
"time"

cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/drivers"
)

// NewDriverHandle returns a handle for task operations on a specific task
func NewDriverHandle(driver drivers.DriverPlugin, taskID string, task *structs.Task, net *drivers.DriverNetwork) *DriverHandle {
func NewDriverHandle(
driver drivers.DriverPlugin,
taskID string,
task *structs.Task,
maxKillTimeout time.Duration,
net *drivers.DriverNetwork) *DriverHandle {
return &DriverHandle{
driver: driver,
net: net,
taskID: taskID,
killSignal: task.KillSignal,
killTimeout: task.KillTimeout,
killTimeout: helper.Min(task.KillTimeout, maxKillTimeout),
}
}

Expand Down
2 changes: 1 addition & 1 deletion client/allocrunner/taskrunner/remotetask_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func (h *remoteTaskHook) Prestart(ctx context.Context, req *interfaces.TaskPrest
return nil
}

h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), taskInfo.NetworkOverride))
h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), h.tr.clientConfig.MaxKillTimeout, taskInfo.NetworkOverride))

h.tr.stateLock.Lock()
h.tr.localState.TaskHandle = th
Expand Down
4 changes: 2 additions & 2 deletions client/allocrunner/taskrunner/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -883,7 +883,7 @@ func (tr *TaskRunner) runDriver() error {
}
tr.stateLock.Unlock()

tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))
tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net))

// Emit an event that we started
tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
Expand Down Expand Up @@ -1182,7 +1182,7 @@ func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers
}

// Update driver handle on task runner
tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net))
return true
}

Expand Down
5 changes: 5 additions & 0 deletions client/config/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"io/ioutil"
"os"
"path/filepath"
"time"

"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper"
Expand Down Expand Up @@ -64,5 +65,9 @@ func TestClientConfig(t testing.T) (*Config, func()) {
// Loosen GC threshold
conf.GCDiskUsageThreshold = 98.0
conf.GCInodeUsageThreshold = 98.0

// Same as default; necessary for task Event messages
conf.MaxKillTimeout = 30 * time.Second

return conf, cleanup
}
9 changes: 5 additions & 4 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -6979,7 +6979,7 @@ type Task struct {
// task exits, other tasks will be gracefully terminated.
Leader bool

// ShutdownDelay is the duration of the delay between deregistering a
// ShutdownDelay is the duration of the delay between de-registering a
// task from Consul and sending it a signal to shutdown. See #2441
ShutdownDelay time.Duration

Expand Down Expand Up @@ -8382,9 +8382,10 @@ func (e *TaskEvent) SetValidationError(err error) *TaskEvent {
return e
}

func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent {
e.KillTimeout = timeout
e.Details["kill_timeout"] = timeout.String()
func (e *TaskEvent) SetKillTimeout(timeout, maxTimeout time.Duration) *TaskEvent {
actual := helper.Min(timeout, maxTimeout)
e.KillTimeout = actual
e.Details["kill_timeout"] = actual.String()
return e
}

Expand Down
3 changes: 2 additions & 1 deletion nomad/structs/structs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6137,7 +6137,8 @@ func TestTaskEventPopulate(t *testing.T) {
{NewTaskEvent(TaskRestarting).SetRestartReason("Chaos Monkey did it"), "Chaos Monkey did it - Task restarting in 0s"},
{NewTaskEvent(TaskKilling), "Sent interrupt"},
{NewTaskEvent(TaskKilling).SetKillReason("Its time for you to die"), "Its time for you to die"},
{NewTaskEvent(TaskKilling).SetKillTimeout(1 * time.Second), "Sent interrupt. Waiting 1s before force killing"},
{NewTaskEvent(TaskKilling).SetKillTimeout(1*time.Second, 5*time.Second), "Sent interrupt. Waiting 1s before force killing"},
{NewTaskEvent(TaskKilling).SetKillTimeout(10*time.Second, 5*time.Second), "Sent interrupt. Waiting 5s before force killing"},
{NewTaskEvent(TaskTerminated).SetExitCode(-1).SetSignal(3), "Exit Code: -1, Signal: 3"},
{NewTaskEvent(TaskTerminated).SetMessage("Goodbye"), "Exit Code: 0, Exit Message: \"Goodbye\""},
{NewTaskEvent(TaskKilled), "Task successfully killed"},
Expand Down

0 comments on commit 7f11f88

Please sign in to comment.