Skip to content

Commit

Permalink
client: enforce max_kill_timeout client configuration
Browse files Browse the repository at this point in the history
This PR fixes a bug where client configuration max_kill_timeout was
not being enforced. The feature was introduced in 9f44780 but seems
to have been removed during the major drivers refactoring.

We can make sure the value is enforced by pluming it through the DriverHandler,
which now uses the lesser of the task.killTimeout or client.maxKillTimeout.
Also updates Event.SetKillTimeout to require both the task.killTimeout and
client.maxKillTimeout so that we don't make the mistake of using the wrong
value - as it was being given only the task.killTimeout before.
  • Loading branch information
shoenig committed Jul 6, 2022
1 parent e193217 commit dbcccc7
Show file tree
Hide file tree
Showing 9 changed files with 30 additions and 14 deletions.
3 changes: 3 additions & 0 deletions .changelog/13626.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
client: Fixed a bug where max_kill_timeout client config was ignored
```
6 changes: 3 additions & 3 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
}

taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
Expand All @@ -643,7 +643,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
go func(name string, tr *taskrunner.TaskRunner) {
defer wg.Done()
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping task", "error", err, "task_name", name)
Expand All @@ -667,7 +667,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
go func(name string, tr *taskrunner.TaskRunner) {
defer wg.Done()
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name)
Expand Down
2 changes: 1 addition & 1 deletion client/allocrunner/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {

expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing"
if killingMsg != expectedKillingMsg {
return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", killingMsg, expectedKillingMsg)
return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg)
}

// Task Two should be dead
Expand Down
10 changes: 8 additions & 2 deletions client/allocrunner/taskrunner/driver_handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,24 @@ import (
"time"

cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/drivers"
)

// NewDriverHandle returns a handle for task operations on a specific task
func NewDriverHandle(driver drivers.DriverPlugin, taskID string, task *structs.Task, net *drivers.DriverNetwork) *DriverHandle {
func NewDriverHandle(
driver drivers.DriverPlugin,
taskID string,
task *structs.Task,
maxKillTimeout time.Duration,
net *drivers.DriverNetwork) *DriverHandle {
return &DriverHandle{
driver: driver,
net: net,
taskID: taskID,
killSignal: task.KillSignal,
killTimeout: task.KillTimeout,
killTimeout: helper.Min(task.KillTimeout, maxKillTimeout),
}
}

Expand Down
2 changes: 1 addition & 1 deletion client/allocrunner/taskrunner/remotetask_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func (h *remoteTaskHook) Prestart(ctx context.Context, req *interfaces.TaskPrest
return nil
}

h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), taskInfo.NetworkOverride))
h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), h.tr.clientConfig.MaxKillTimeout, taskInfo.NetworkOverride))

h.tr.stateLock.Lock()
h.tr.localState.TaskHandle = th
Expand Down
4 changes: 2 additions & 2 deletions client/allocrunner/taskrunner/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -883,7 +883,7 @@ func (tr *TaskRunner) runDriver() error {
}
tr.stateLock.Unlock()

tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))
tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net))

// Emit an event that we started
tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
Expand Down Expand Up @@ -1182,7 +1182,7 @@ func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers
}

// Update driver handle on task runner
tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net))
return true
}

Expand Down
5 changes: 5 additions & 0 deletions client/config/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"io/ioutil"
"os"
"path/filepath"
"time"

"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper"
Expand Down Expand Up @@ -64,5 +65,9 @@ func TestClientConfig(t testing.T) (*Config, func()) {
// Loosen GC threshold
conf.GCDiskUsageThreshold = 98.0
conf.GCInodeUsageThreshold = 98.0

// Same as default; necessary for task Event messages
conf.MaxKillTimeout = 30 * time.Second

return conf, cleanup
}
9 changes: 5 additions & 4 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -6976,7 +6976,7 @@ type Task struct {
// task exits, other tasks will be gracefully terminated.
Leader bool

// ShutdownDelay is the duration of the delay between deregistering a
// ShutdownDelay is the duration of the delay between de-registering a
// task from Consul and sending it a signal to shutdown. See #2441
ShutdownDelay time.Duration

Expand Down Expand Up @@ -8379,9 +8379,10 @@ func (e *TaskEvent) SetValidationError(err error) *TaskEvent {
return e
}

func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent {
e.KillTimeout = timeout
e.Details["kill_timeout"] = timeout.String()
func (e *TaskEvent) SetKillTimeout(timeout, maxTimeout time.Duration) *TaskEvent {
actual := helper.Min(timeout, maxTimeout)
e.KillTimeout = actual
e.Details["kill_timeout"] = actual.String()
return e
}

Expand Down
3 changes: 2 additions & 1 deletion nomad/structs/structs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6137,7 +6137,8 @@ func TestTaskEventPopulate(t *testing.T) {
{NewTaskEvent(TaskRestarting).SetRestartReason("Chaos Monkey did it"), "Chaos Monkey did it - Task restarting in 0s"},
{NewTaskEvent(TaskKilling), "Sent interrupt"},
{NewTaskEvent(TaskKilling).SetKillReason("Its time for you to die"), "Its time for you to die"},
{NewTaskEvent(TaskKilling).SetKillTimeout(1 * time.Second), "Sent interrupt. Waiting 1s before force killing"},
{NewTaskEvent(TaskKilling).SetKillTimeout(1*time.Second, 5*time.Second), "Sent interrupt. Waiting 1s before force killing"},
{NewTaskEvent(TaskKilling).SetKillTimeout(10*time.Second, 5*time.Second), "Sent interrupt. Waiting 5s before force killing"},
{NewTaskEvent(TaskTerminated).SetExitCode(-1).SetSignal(3), "Exit Code: -1, Signal: 3"},
{NewTaskEvent(TaskTerminated).SetMessage("Goodbye"), "Exit Code: 0, Exit Message: \"Goodbye\""},
{NewTaskEvent(TaskKilled), "Task successfully killed"},
Expand Down

0 comments on commit dbcccc7

Please sign in to comment.