Skip to content

Commit

Permalink
driver/docker: allow configurable pull context timeout setting.
Browse files Browse the repository at this point in the history
Pulling large docker containers can take longer than the default
context timeout. Without a way to change this it is very hard for
users to utilise Nomad properly without hacky work arounds.

This change adds an optional pull_timeout config parameter which
gives operators the possibility to account for increase pull times
where needed. The infra docker image also has the option to set a
custom timeout to keep consistency.
  • Loading branch information
jrasell committed Aug 12, 2020
1 parent a2a727b commit a40a140
Show file tree
Hide file tree
Showing 10 changed files with 199 additions and 119 deletions.
45 changes: 32 additions & 13 deletions drivers/docker/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,14 +259,18 @@ var (
hclspec.NewAttr("infra_image", "string", false),
hclspec.NewLiteral(`"gcr.io/google_containers/pause-amd64:3.0"`),
),
// timeout to use when pulling the infra image.
"infra_image_pull_timeout": hclspec.NewDefault(
hclspec.NewAttr("infra_image_pull_timeout", "string", false),
hclspec.NewLiteral(`"5m"`),
),

// the duration that the driver will wait for activity from the Docker engine during an image pull
// before canceling the request
"pull_activity_timeout": hclspec.NewDefault(
hclspec.NewAttr("pull_activity_timeout", "string", false),
hclspec.NewLiteral(`"2m"`),
),

// disable_log_collection indicates whether docker driver should collect logs of docker
// task containers. If true, nomad doesn't start docker_logger/logmon processes
"disable_log_collection": hclspec.NewAttr("disable_log_collection", "bool", false),
Expand Down Expand Up @@ -349,6 +353,10 @@ var (
"pid_mode": hclspec.NewAttr("pid_mode", "string", false),
"port_map": hclspec.NewAttr("port_map", "list(map(number))", false),
"privileged": hclspec.NewAttr("privileged", "bool", false),
"image_pull_timeout": hclspec.NewDefault(
hclspec.NewAttr("image_pull_timeout", "string", false),
hclspec.NewLiteral(`"5m"`),
),
"readonly_rootfs": hclspec.NewAttr("readonly_rootfs", "bool", false),
"security_opt": hclspec.NewAttr("security_opt", "list(string)", false),
"shm_size": hclspec.NewAttr("shm_size", "number", false),
Expand Down Expand Up @@ -415,6 +423,7 @@ type TaskConfig struct {
PidMode string `codec:"pid_mode"`
PortMap hclutils.MapStrInt `codec:"port_map"`
Privileged bool `codec:"privileged"`
ImagePullTimeout string `codec:"image_pull_timeout"`
ReadonlyRootfs bool `codec:"readonly_rootfs"`
SecurityOpt []string `codec:"security_opt"`
ShmSize int64 `codec:"shm_size"`
Expand Down Expand Up @@ -567,18 +576,20 @@ type ContainerGCConfig struct {
}

type DriverConfig struct {
Endpoint string `codec:"endpoint"`
Auth AuthConfig `codec:"auth"`
TLS TLSConfig `codec:"tls"`
GC GCConfig `codec:"gc"`
Volumes VolumeConfig `codec:"volumes"`
AllowPrivileged bool `codec:"allow_privileged"`
AllowCaps []string `codec:"allow_caps"`
GPURuntimeName string `codec:"nvidia_runtime"`
InfraImage string `codec:"infra_image"`
DisableLogCollection bool `codec:"disable_log_collection"`
PullActivityTimeout string `codec:"pull_activity_timeout"`
pullActivityTimeoutDuration time.Duration `codec:"-"`
Endpoint string `codec:"endpoint"`
Auth AuthConfig `codec:"auth"`
TLS TLSConfig `codec:"tls"`
GC GCConfig `codec:"gc"`
Volumes VolumeConfig `codec:"volumes"`
AllowPrivileged bool `codec:"allow_privileged"`
AllowCaps []string `codec:"allow_caps"`
GPURuntimeName string `codec:"nvidia_runtime"`
InfraImage string `codec:"infra_image"`
InfraImagePullTimeout string `codec:"infra_image_pull_timeout"`
infraImagePullTimeoutDuration time.Duration `codec:"-"`
DisableLogCollection bool `codec:"disable_log_collection"`
PullActivityTimeout string `codec:"pull_activity_timeout"`
pullActivityTimeoutDuration time.Duration `codec:"-"`

AllowRuntimesList []string `codec:"allow_runtimes"`
allowRuntimes map[string]struct{} `codec:"-"`
Expand Down Expand Up @@ -667,6 +678,14 @@ func (d *Driver) SetConfig(c *base.Config) error {
d.config.pullActivityTimeoutDuration = dur
}

if d.config.InfraImagePullTimeout != "" {
dur, err := time.ParseDuration(d.config.InfraImagePullTimeout)
if err != nil {
return fmt.Errorf("failed to parse 'infra_image_pull_timeout' duaration: %v", err)
}
d.config.infraImagePullTimeoutDuration = dur
}

d.config.allowRuntimes = make(map[string]struct{}, len(d.config.AllowRuntimesList))
for _, r := range d.config.AllowRuntimesList {
d.config.allowRuntimes[r] = struct{}{}
Expand Down
74 changes: 54 additions & 20 deletions drivers/docker/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ func TestConfig_ParseHCL(t *testing.T) {
image = "redis:3.2"
}`,
&TaskConfig{
Image: "redis:3.2",
Devices: []DockerDevice{},
Mounts: []DockerMount{},
CPUCFSPeriod: 100000,
Image: "redis:3.2",
Devices: []DockerDevice{},
Mounts: []DockerMount{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
},
}
Expand Down Expand Up @@ -53,40 +54,44 @@ func TestConfig_ParseJSON(t *testing.T) {
name: "nil values for blocks are safe",
input: `{"Config": {"image": "bash:3", "mounts": null}}`,
expected: TaskConfig{
Image: "bash:3",
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
Image: "bash:3",
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
},
{
name: "nil values for 'volumes' field are safe",
input: `{"Config": {"image": "bash:3", "volumes": null}}`,
expected: TaskConfig{
Image: "bash:3",
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
Image: "bash:3",
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
},
{
name: "nil values for 'args' field are safe",
input: `{"Config": {"image": "bash:3", "args": null}}`,
expected: TaskConfig{
Image: "bash:3",
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
Image: "bash:3",
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
},
{
name: "nil values for string fields are safe",
input: `{"Config": {"image": "bash:3", "command": null}}`,
expected: TaskConfig{
Image: "bash:3",
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
Image: "bash:3",
Mounts: []DockerMount{},
Devices: []DockerDevice{},
CPUCFSPeriod: 100000,
ImagePullTimeout: "5m",
},
},
}
Expand Down Expand Up @@ -178,6 +183,7 @@ func TestConfig_ParseAllHCL(t *testing.T) {
cfgStr := `
config {
image = "redis:3.2"
image_pull_timeout = "15m"
advertise_ipv6_address = true
args = ["command_arg1", "command_arg2"]
auth {
Expand Down Expand Up @@ -301,6 +307,7 @@ config {

expected := &TaskConfig{
Image: "redis:3.2",
ImagePullTimeout: "15m",
AdvertiseIPv6Addr: true,
Args: []string{"command_arg1", "command_arg2"},
Auth: DockerAuth{
Expand Down Expand Up @@ -528,6 +535,33 @@ func TestConfig_InternalCapabilities(t *testing.T) {
}
}

func TestConfig_DriverConfig_InfraImagePullTimeout(t *testing.T) {
cases := []struct {
name string
config string
expected string
}{
{
name: "default",
config: `{}`,
expected: "5m",
},
{
name: "set explicitly",
config: `{ infra_image_pull_timeout = "1m" }`,
expected: "1m",
},
}

for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
var tc DriverConfig
hclutils.NewConfigParser(configSpec).ParseHCL(t, "config "+c.config, &tc)
require.Equal(t, c.expected, tc.InfraImagePullTimeout)
})
}
}

func TestConfig_DriverConfig_PullActivityTimeout(t *testing.T) {
cases := []struct {
name string
Expand Down
11 changes: 7 additions & 4 deletions drivers/docker/coordinator.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator {

// PullImage is used to pull an image. It returns the pulled imaged ID or an
// error that occurred during the pull
func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn, pullActivityTimeout time.Duration) (imageID string, err error) {
func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string,
emitFn LogEventFn, pullTimeout, pullActivityTimeout time.Duration) (imageID string, err error) {
// Get the future
d.imageLock.Lock()
future, ok := d.pullFutures[image]
Expand All @@ -140,7 +141,7 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf
// Make the future
future = newPullFuture()
d.pullFutures[image] = future
go d.pullImageImpl(image, authOptions, pullActivityTimeout, future)
go d.pullImageImpl(image, authOptions, pullTimeout, pullActivityTimeout, future)
}
d.imageLock.Unlock()

Expand All @@ -167,11 +168,13 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf

// pullImageImpl is the implementation of pulling an image. The results are
// returned via the passed future
func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, pullActivityTimeout time.Duration, future *pullFuture) {
func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration,
pullTimeout, pullActivityTimeout time.Duration, future *pullFuture) {

defer d.clearPullLogger(image)
// Parse the repo and tag
repo, tag := parseDockerImage(image)
ctx, cancel := context.WithCancel(context.Background())
ctx, cancel := context.WithTimeout(context.Background(), pullTimeout)
defer cancel()

pm := newImageProgressManager(image, cancel, pullActivityTimeout, d.handlePullInactivity,
Expand Down
16 changes: 8 additions & 8 deletions drivers/docker/coordinator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,10 @@ func TestDockerCoordinator_ConcurrentPulls(t *testing.T) {
// Create a coordinator
coordinator := newDockerCoordinator(config)

id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil, 2*time.Minute)
id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil, 5*time.Minute, 2*time.Minute)
for i := 0; i < 9; i++ {
go func() {
coordinator.PullImage(image, nil, uuid.Generate(), nil, 2*time.Minute)
coordinator.PullImage(image, nil, uuid.Generate(), nil, 5*time.Minute, 2*time.Minute)
}()
}

Expand Down Expand Up @@ -129,7 +129,7 @@ func TestDockerCoordinator_Pull_Remove(t *testing.T) {
callerIDs := make([]string, 10, 10)
for i := 0; i < 10; i++ {
callerIDs[i] = uuid.Generate()
id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil, 2*time.Minute)
id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil, 5*time.Minute, 2*time.Minute)
}

// Check the reference count
Expand Down Expand Up @@ -195,7 +195,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) {
callerID := uuid.Generate()

// Pull image
id, _ := coordinator.PullImage(image, nil, callerID, nil, 2*time.Minute)
id, _ := coordinator.PullImage(image, nil, callerID, nil, 5*time.Minute, 2*time.Minute)

// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 1 {
Expand All @@ -211,7 +211,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) {
}

// Pull image again within delay
id, _ = coordinator.PullImage(image, nil, callerID, nil, 2*time.Minute)
id, _ = coordinator.PullImage(image, nil, callerID, nil, 5*time.Minute, 2*time.Minute)

// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 1 {
Expand Down Expand Up @@ -244,7 +244,7 @@ func TestDockerCoordinator_No_Cleanup(t *testing.T) {
callerID := uuid.Generate()

// Pull image
id, _ := coordinator.PullImage(image, nil, callerID, nil, 2*time.Minute)
id, _ := coordinator.PullImage(image, nil, callerID, nil, 5*time.Minute, 2*time.Minute)

// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 0 {
Expand Down Expand Up @@ -283,10 +283,10 @@ func TestDockerCoordinator_Cleanup_HonorsCtx(t *testing.T) {
callerID := uuid.Generate()

// Pull image
id1, _ := coordinator.PullImage(image1ID, nil, callerID, nil, 2*time.Minute)
id1, _ := coordinator.PullImage(image1ID, nil, callerID, nil, 5*time.Minute, 2*time.Minute)
require.Len(t, coordinator.imageRefCount[id1], 1, "image reference count")

id2, _ := coordinator.PullImage(image2ID, nil, callerID, nil, 2*time.Minute)
id2, _ := coordinator.PullImage(image2ID, nil, callerID, nil, 5*time.Minute, 2*time.Minute)
require.Len(t, coordinator.imageRefCount[id2], 1, "image reference count")

// remove one image, cancel ctx, remove second, and assert only first image is cleanedup
Expand Down
7 changes: 6 additions & 1 deletion drivers/docker/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,12 @@ func (d *Driver) pullImage(task *drivers.TaskConfig, driverConfig *TaskConfig, c
},
})

return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task), d.config.pullActivityTimeoutDuration)
pullDur, err := time.ParseDuration(driverConfig.ImagePullTimeout)
if err != nil {
return "", fmt.Errorf("Failed to parse image_pull_timeout: %v", err)
}

return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task), pullDur, d.config.pullActivityTimeoutDuration)
}

func (d *Driver) emitEventFunc(task *drivers.TaskConfig) LogEventFn {
Expand Down
5 changes: 3 additions & 2 deletions drivers/docker/driver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,8 +452,9 @@ func TestDockerDriver_Start_BadPull_Recoverable(t *testing.T) {
testutil.DockerCompatible(t)

taskCfg := TaskConfig{
Image: "127.0.0.1:32121/foo", // bad path
Command: "echo",
Image: "127.0.0.1:32121/foo", // bad path
ImagePullTimeout: "5m",
Command: "echo",
Args: []string{
"hello",
},
Expand Down
12 changes: 7 additions & 5 deletions drivers/docker/driver_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,8 @@ func TestDockerDriver_Start_Image_HTTPS(t *testing.T) {
testutil.DockerCompatible(t)

taskCfg := TaskConfig{
Image: "https://gcr.io/google_containers/pause:0.8.0",
Image: "https://gcr.io/google_containers/pause:0.8.0",
ImagePullTimeout: "5m",
}
task := &drivers.TaskConfig{
ID: uuid.Generate(),
Expand Down Expand Up @@ -746,10 +747,11 @@ func newTaskConfig(variant string, command []string) TaskConfig {
}

return TaskConfig{
Image: image,
LoadImage: loadImage,
Command: command[0],
Args: command[1:],
Image: image,
ImagePullTimeout: "5m",
LoadImage: loadImage,
Command: command[0],
Args: command[1:],
}
}

Expand Down
7 changes: 4 additions & 3 deletions drivers/docker/driver_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ func newTaskConfig(variant string, command []string) TaskConfig {
busyboxImageID := "hashicorpnomad/busybox-windows:server2016-0.1"

return TaskConfig{
Image: busyboxImageID,
Command: command[0],
Args: command[1:],
Image: busyboxImageID,
ImagePullTimeout: "5m",
Command: command[0],
Args: command[1:],
}
}

Expand Down
2 changes: 1 addition & 1 deletion drivers/docker/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ func (d *Driver) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, b
if err != nil {
d.logger.Debug("auth failed for infra container image pull", "image", d.config.InfraImage, "error", err)
}
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn, d.config.pullActivityTimeoutDuration)
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn, d.config.infraImagePullTimeoutDuration, d.config.pullActivityTimeoutDuration)
if err != nil {
return nil, false, err
}
Expand Down
Loading

0 comments on commit a40a140

Please sign in to comment.