Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

client: enforce max_kill_timeout client configuration #13626

Merged
merged 1 commit into from
Jul 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/13626.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
client: Fixed a bug where max_kill_timeout client config was ignored
```
6 changes: 3 additions & 3 deletions client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
}

taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
Expand All @@ -643,7 +643,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
go func(name string, tr *taskrunner.TaskRunner) {
defer wg.Done()
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping task", "error", err, "task_name", name)
Expand All @@ -667,7 +667,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
go func(name string, tr *taskrunner.TaskRunner) {
defer wg.Done()
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name)
Expand Down
2 changes: 1 addition & 1 deletion client/allocrunner/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {

expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing"
if killingMsg != expectedKillingMsg {
return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", killingMsg, expectedKillingMsg)
return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg)
}

// Task Two should be dead
Expand Down
10 changes: 8 additions & 2 deletions client/allocrunner/taskrunner/driver_handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,24 @@ import (
"time"

cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/drivers"
)

// NewDriverHandle returns a handle for task operations on a specific task
func NewDriverHandle(driver drivers.DriverPlugin, taskID string, task *structs.Task, net *drivers.DriverNetwork) *DriverHandle {
func NewDriverHandle(
driver drivers.DriverPlugin,
taskID string,
task *structs.Task,
maxKillTimeout time.Duration,
net *drivers.DriverNetwork) *DriverHandle {
return &DriverHandle{
driver: driver,
net: net,
taskID: taskID,
killSignal: task.KillSignal,
killTimeout: task.KillTimeout,
killTimeout: helper.Min(task.KillTimeout, maxKillTimeout),
}
}

Expand Down
2 changes: 1 addition & 1 deletion client/allocrunner/taskrunner/remotetask_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func (h *remoteTaskHook) Prestart(ctx context.Context, req *interfaces.TaskPrest
return nil
}

h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), taskInfo.NetworkOverride))
h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), h.tr.clientConfig.MaxKillTimeout, taskInfo.NetworkOverride))

h.tr.stateLock.Lock()
h.tr.localState.TaskHandle = th
Expand Down
4 changes: 2 additions & 2 deletions client/allocrunner/taskrunner/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -883,7 +883,7 @@ func (tr *TaskRunner) runDriver() error {
}
tr.stateLock.Unlock()

tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))
tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net))

// Emit an event that we started
tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
Expand Down Expand Up @@ -1182,7 +1182,7 @@ func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers
}

// Update driver handle on task runner
tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net))
return true
}

Expand Down
5 changes: 5 additions & 0 deletions client/config/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"io/ioutil"
"os"
"path/filepath"
"time"

"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper"
Expand Down Expand Up @@ -64,5 +65,9 @@ func TestClientConfig(t testing.T) (*Config, func()) {
// Loosen GC threshold
conf.GCDiskUsageThreshold = 98.0
conf.GCInodeUsageThreshold = 98.0

// Same as default; necessary for task Event messages
conf.MaxKillTimeout = 30 * time.Second

return conf, cleanup
}
9 changes: 5 additions & 4 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -6976,7 +6976,7 @@ type Task struct {
// task exits, other tasks will be gracefully terminated.
Leader bool

// ShutdownDelay is the duration of the delay between deregistering a
// ShutdownDelay is the duration of the delay between de-registering a
// task from Consul and sending it a signal to shutdown. See #2441
ShutdownDelay time.Duration

Expand Down Expand Up @@ -8379,9 +8379,10 @@ func (e *TaskEvent) SetValidationError(err error) *TaskEvent {
return e
}

func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent {
e.KillTimeout = timeout
e.Details["kill_timeout"] = timeout.String()
func (e *TaskEvent) SetKillTimeout(timeout, maxTimeout time.Duration) *TaskEvent {
actual := helper.Min(timeout, maxTimeout)
e.KillTimeout = actual
e.Details["kill_timeout"] = actual.String()
return e
}

Expand Down
3 changes: 2 additions & 1 deletion nomad/structs/structs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6137,7 +6137,8 @@ func TestTaskEventPopulate(t *testing.T) {
{NewTaskEvent(TaskRestarting).SetRestartReason("Chaos Monkey did it"), "Chaos Monkey did it - Task restarting in 0s"},
{NewTaskEvent(TaskKilling), "Sent interrupt"},
{NewTaskEvent(TaskKilling).SetKillReason("Its time for you to die"), "Its time for you to die"},
{NewTaskEvent(TaskKilling).SetKillTimeout(1 * time.Second), "Sent interrupt. Waiting 1s before force killing"},
{NewTaskEvent(TaskKilling).SetKillTimeout(1*time.Second, 5*time.Second), "Sent interrupt. Waiting 1s before force killing"},
{NewTaskEvent(TaskKilling).SetKillTimeout(10*time.Second, 5*time.Second), "Sent interrupt. Waiting 5s before force killing"},
{NewTaskEvent(TaskTerminated).SetExitCode(-1).SetSignal(3), "Exit Code: -1, Signal: 3"},
{NewTaskEvent(TaskTerminated).SetMessage("Goodbye"), "Exit Code: 0, Exit Message: \"Goodbye\""},
{NewTaskEvent(TaskKilled), "Task successfully killed"},
Expand Down