Skip to content

Commit

Permalink
docker: kill signal API should include timeout context (#9502)
Browse files Browse the repository at this point in the history
When the Docker driver kills as task, we send a request via the Docker API for
dockerd to fire the signal. We send that signal and then block for the
`kill_timeout` waiting for the container to exit. But if the Docker API
blocks, we will block indefinitely because we haven't configured the API call
with the same timeout.

This changeset is a minimal intervention to add the timeout to the Docker API
call _only_ when we have the `kill_timeout` set. Future work should examine
whether we should be threading contexts through other `go-dockerclient` API
calls.
  • Loading branch information
tgross committed Dec 2, 2020
2 parents cf6055e + a1bfc03 commit dc2bec7
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ BUG FIXES:
* csi: Fixed a bug where `nomad volume detach` would not accept prefixes for the node ID parameter. [[GH-9041](https://github.com/hashicorp/nomad/issues/9041)]
* csi: Fixed a bug where `nomad alloc status -verbose` would display an error when querying volumes. [[GH-9354](https://github.com/hashicorp/nomad/issues/9354)]
* csi: Fixed a bug where queries for CSI plugins could be interleaved, resulting in inconsistent counts of plugins. [[GH-9438](https://github.com/hashicorp/nomad/issues/9438)]
* driver/docker: Fixed a bug where the Docker daemon could block longer than the `kill_timeout`. [[GH-9502](https://github.com/hashicorp/nomad/issues/9502)
* driver/docker: Fixed a bug where the default `image_delay` configuration was ignored if the `gc` configuration was not set. [[GH-9101](https://github.com/hashicorp/nomad/issues/9101)]
* driver/raw_exec: Fixed a bug where raw_exec attempts to create a freezer cgroups for the tasks even when `no_cgroups` is set. [[GH-9328](https://github.com/hashicorp/nomad/issues/9328)]
* ui: Fixed a bug in the volume status page where read allocations and write allocations were not displayed. [[GH-9377](https://github.com/hashicorp/nomad/issues/9377)]
Expand Down
5 changes: 4 additions & 1 deletion drivers/docker/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -1461,7 +1461,10 @@ func (d *Driver) SignalTask(taskID string, signal string) error {
return fmt.Errorf("failed to parse signal: %v", err)
}

return h.Signal(sig)
// TODO: review whether we can timeout in this and other Docker API
// calls without breaking the expected client behavior.
// see https://github.com/hashicorp/nomad/issues/9503
return h.Signal(context.Background(), sig)
}

func (d *Driver) ExecTask(taskID string, cmd []string, timeout time.Duration) (*drivers.ExecTaskResult, error) {
Expand Down
14 changes: 9 additions & 5 deletions drivers/docker/handle.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func (h *taskHandle) Exec(ctx context.Context, cmd string, args []string) (*driv
return execResult, nil
}

func (h *taskHandle) Signal(s os.Signal) error {
func (h *taskHandle) Signal(ctx context.Context, s os.Signal) error {
// Convert types
sysSig, ok := s.(syscall.Signal)
if !ok {
Expand All @@ -116,8 +116,9 @@ func (h *taskHandle) Signal(s os.Signal) error {

dockerSignal := docker.Signal(sysSig)
opts := docker.KillContainerOptions{
ID: h.containerID,
Signal: dockerSignal,
ID: h.containerID,
Signal: dockerSignal,
Context: ctx,
}
return h.client.KillContainer(opts)

Expand All @@ -127,7 +128,10 @@ func (h *taskHandle) Signal(s os.Signal) error {
func (h *taskHandle) Kill(killTimeout time.Duration, signal os.Signal) error {
// Only send signal if killTimeout is set, otherwise stop container
if killTimeout > 0 {
if err := h.Signal(signal); err != nil {
ctx, cancel := context.WithTimeout(context.Background(), killTimeout)
defer cancel()

if err := h.Signal(ctx, signal); err != nil {
// Container has already been removed.
if strings.Contains(err.Error(), NoSuchContainerError) {
h.logger.Debug("attempted to signal nonexistent container")
Expand All @@ -146,7 +150,7 @@ func (h *taskHandle) Kill(killTimeout time.Duration, signal os.Signal) error {
select {
case <-h.waitCh:
return nil
case <-time.After(killTimeout):
case <-ctx.Done():
}
}

Expand Down

0 comments on commit dc2bec7

Please sign in to comment.