Skip to content

Commit

Permalink
Making pull activity timeout configurable in Docker
Browse files Browse the repository at this point in the history
* Making pull activity timeout configurable in Docker plugin config, first pass

* Fixing broken function call

* Fixing broken tests

* Fixing linter suggestion

* Adding documentation on new parameter in Docker plugin config

* Adding unit test

* Setting min value for pull_activity_timeout, making pull activity duration a private var
  • Loading branch information
schledererj authored and endocrimes committed Dec 18, 2019
1 parent 309b4ff commit 8159273
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 29 deletions.
43 changes: 32 additions & 11 deletions drivers/docker/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"time"

docker "github.com/fsouza/go-dockerclient"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper/pluginutils/hclutils"
"github.com/hashicorp/nomad/helper/pluginutils/loader"
"github.com/hashicorp/nomad/plugins/base"
Expand Down Expand Up @@ -258,6 +258,13 @@ var (
hclspec.NewLiteral(`"gcr.io/google_containers/pause-amd64:3.0"`),
),

// the duration that the driver will wait for activity from the Docker engine during an image pull
// before canceling the request
"pull_activity_timeout": hclspec.NewDefault(
hclspec.NewAttr("pull_activity_timeout", "string", false),
hclspec.NewLiteral(`"2m"`),
),

// disable_log_collection indicates whether docker driver should collect logs of docker
// task containers. If true, nomad doesn't start docker_logger/logmon processes
"disable_log_collection": hclspec.NewAttr("disable_log_collection", "bool", false),
Expand Down Expand Up @@ -553,16 +560,18 @@ type ContainerGCConfig struct {
}

type DriverConfig struct {
Endpoint string `codec:"endpoint"`
Auth AuthConfig `codec:"auth"`
TLS TLSConfig `codec:"tls"`
GC GCConfig `codec:"gc"`
Volumes VolumeConfig `codec:"volumes"`
AllowPrivileged bool `codec:"allow_privileged"`
AllowCaps []string `codec:"allow_caps"`
GPURuntimeName string `codec:"nvidia_runtime"`
InfraImage string `codec:"infra_image"`
DisableLogCollection bool `codec:"disable_log_collection"`
Endpoint string `codec:"endpoint"`
Auth AuthConfig `codec:"auth"`
TLS TLSConfig `codec:"tls"`
GC GCConfig `codec:"gc"`
Volumes VolumeConfig `codec:"volumes"`
AllowPrivileged bool `codec:"allow_privileged"`
AllowCaps []string `codec:"allow_caps"`
GPURuntimeName string `codec:"nvidia_runtime"`
InfraImage string `codec:"infra_image"`
DisableLogCollection bool `codec:"disable_log_collection"`
PullActivityTimeout string `codec:"pull_activity_timeout"`
pullActivityTimeoutDuration time.Duration `codec:"-"`
}

type AuthConfig struct {
Expand Down Expand Up @@ -599,6 +608,7 @@ func (d *Driver) ConfigSchema() (*hclspec.Spec, error) {
}

const danglingContainersCreationGraceMinimum = 1 * time.Minute
const pullActivityTimeoutMinimum = 1 * time.Minute

func (d *Driver) SetConfig(c *base.Config) error {
var config DriverConfig
Expand Down Expand Up @@ -636,6 +646,17 @@ func (d *Driver) SetConfig(c *base.Config) error {
d.config.GC.DanglingContainers.CreationGrace = dur
}

if len(d.config.PullActivityTimeout) > 0 {
dur, err := time.ParseDuration(d.config.PullActivityTimeout)
if err != nil {
return fmt.Errorf("failed to parse 'pull_activity_timeout' duaration: %v", err)
}
if dur < pullActivityTimeoutMinimum {
return fmt.Errorf("pull_activity_timeout is less than minimum, %v", pullActivityTimeoutMinimum)
}
d.config.pullActivityTimeoutDuration = dur
}

if c.AgentConfig != nil {
d.clientConfig = c.AgentConfig.Driver
}
Expand Down
27 changes: 27 additions & 0 deletions drivers/docker/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -524,3 +524,30 @@ func TestConfig_InternalCapabilities(t *testing.T) {
}

}

func TestConfig_DriverConfig_PullActivityTimeout(t *testing.T) {
cases := []struct {
name string
config string
expected string
}{
{
name: "default",
config: `{}`,
expected: "2m",
},
{
name: "set explicitly",
config: `{ pull_activity_timeout = "5m" }`,
expected: "5m",
},
}

for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
var tc DriverConfig
hclutils.NewConfigParser(configSpec).ParseHCL(t, "config "+c.config, &tc)
require.Equal(t, c.expected, tc.PullActivityTimeout)
})
}
}
8 changes: 4 additions & 4 deletions drivers/docker/coordinator.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator {

// PullImage is used to pull an image. It returns the pulled imaged ID or an
// error that occurred during the pull
func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn) (imageID string, err error) {
func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn, pullActivityTimeout time.Duration) (imageID string, err error) {
// Get the future
d.imageLock.Lock()
future, ok := d.pullFutures[image]
Expand All @@ -138,7 +138,7 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf
// Make the future
future = newPullFuture()
d.pullFutures[image] = future
go d.pullImageImpl(image, authOptions, future)
go d.pullImageImpl(image, authOptions, pullActivityTimeout, future)
}
d.imageLock.Unlock()

Expand All @@ -165,14 +165,14 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf

// pullImageImpl is the implementation of pulling an image. The results are
// returned via the passed future
func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, future *pullFuture) {
func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, pullActivityTimeout time.Duration, future *pullFuture) {
defer d.clearPullLogger(image)
// Parse the repo and tag
repo, tag := parseDockerImage(image)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

pm := newImageProgressManager(image, cancel, d.handlePullInactivity,
pm := newImageProgressManager(image, cancel, pullActivityTimeout, d.handlePullInactivity,
d.handlePullProgressReport, d.handleSlowPullProgressReport)
defer pm.stop()

Expand Down
12 changes: 6 additions & 6 deletions drivers/docker/coordinator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,10 @@ func TestDockerCoordinator_ConcurrentPulls(t *testing.T) {
// Create a coordinator
coordinator := newDockerCoordinator(config)

id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil)
id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil, 2 * time.Minute)
for i := 0; i < 9; i++ {
go func() {
coordinator.PullImage(image, nil, uuid.Generate(), nil)
coordinator.PullImage(image, nil, uuid.Generate(), nil, 2 * time.Minute)
}()
}

Expand Down Expand Up @@ -125,7 +125,7 @@ func TestDockerCoordinator_Pull_Remove(t *testing.T) {
callerIDs := make([]string, 10, 10)
for i := 0; i < 10; i++ {
callerIDs[i] = uuid.Generate()
id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil)
id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil, 2 * time.Minute)
}

// Check the reference count
Expand Down Expand Up @@ -190,7 +190,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) {
callerID := uuid.Generate()

// Pull image
id, _ := coordinator.PullImage(image, nil, callerID, nil)
id, _ := coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute)

// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 1 {
Expand All @@ -206,7 +206,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) {
}

// Pull image again within delay
id, _ = coordinator.PullImage(image, nil, callerID, nil)
id, _ = coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute)

// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 1 {
Expand Down Expand Up @@ -238,7 +238,7 @@ func TestDockerCoordinator_No_Cleanup(t *testing.T) {
callerID := uuid.Generate()

// Pull image
id, _ := coordinator.PullImage(image, nil, callerID, nil)
id, _ := coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute)

// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 0 {
Expand Down
2 changes: 1 addition & 1 deletion drivers/docker/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ func (d *Driver) pullImage(task *drivers.TaskConfig, driverConfig *TaskConfig, c
},
})

return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task))
return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task), d.config.pullActivityTimeoutDuration)
}

func (d *Driver) emitEventFunc(task *drivers.TaskConfig) LogEventFn {
Expand Down
2 changes: 1 addition & 1 deletion drivers/docker/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ func (d *Driver) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, b
if err != nil {
d.logger.Debug("auth failed for infra container image pull", "image", d.config.InfraImage, "error", err)
}
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn)
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn, d.config.pullActivityTimeoutDuration)
if err != nil {
return nil, false, err
}
Expand Down
8 changes: 2 additions & 6 deletions drivers/docker/progress.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ import (
)

const (
// dockerPullActivityDeadline is the default value set in the imageProgressManager
// when newImageProgressManager is called
dockerPullActivityDeadline = 2 * time.Minute

// dockerImageProgressReportInterval is the default value set in the
// imageProgressManager when newImageProgressManager is called
dockerImageProgressReportInterval = 10 * time.Second
Expand Down Expand Up @@ -203,11 +199,11 @@ type imageProgressManager struct {

func newImageProgressManager(
image string, cancel context.CancelFunc,
inactivityFunc, reporter, slowReporter progressReporterFunc) *imageProgressManager {
pullActivityTimeout time.Duration, inactivityFunc, reporter, slowReporter progressReporterFunc) *imageProgressManager {

pm := &imageProgressManager{
image: image,
activityDeadline: dockerPullActivityDeadline,
activityDeadline: pullActivityTimeout,
inactivityFunc: inactivityFunc,
reportInterval: dockerImageProgressReportInterval,
reporter: reporter,
Expand Down
4 changes: 4 additions & 0 deletions website/source/docs/drivers/docker.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,10 @@ plugin "docker" {
the host's devices. Note that you must set a similar setting on the Docker
daemon for this to work.

* `pull_activity_timeout` - Defaults to `2m`. If Nomad receives no communication
from the Docker engine during an image pull within this timeframe, Nomad will
timeout the request that initiated the pull command. (Minimum of `1m`)

* `allow_caps`<a id="plugin_caps"></a> - A list of allowed Linux capabilities.
Defaults to
"CHOWN,DAC_OVERRIDE,FSETID,FOWNER,MKNOD,NET_RAW,SETGID,SETUID,SETFCAP,SETPCAP,
Expand Down

0 comments on commit 8159273

Please sign in to comment.