diff --git a/.changelog/13265.txt b/.changelog/13265.txt new file mode 100644 index 000000000000..7e34eb38ae98 --- /dev/null +++ b/.changelog/13265.txt @@ -0,0 +1,3 @@ +```release-note:improvement +driver/docker: Eliminate excess Docker registry pulls for the `infra_image` when it already exists locally. +``` diff --git a/drivers/docker/network.go b/drivers/docker/network.go index 30a49f6eba52..13b3e6b73f94 100644 --- a/drivers/docker/network.go +++ b/drivers/docker/network.go @@ -28,16 +28,7 @@ func (d *Driver) CreateNetwork(allocID string, createSpec *drivers.NetworkCreate return nil, false, fmt.Errorf("failed to connect to docker daemon: %s", err) } - repo, _ := parseDockerImage(d.config.InfraImage) - authOptions, err := firstValidAuth(repo, []authBackend{ - authFromDockerConfig(d.config.Auth.Config), - authFromHelper(d.config.Auth.Helper), - }) - if err != nil { - d.logger.Debug("auth failed for infra container image pull", "image", d.config.InfraImage, "error", err) - } - _, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn, d.config.infraImagePullTimeoutDuration, d.config.pullActivityTimeoutDuration) - if err != nil { + if err := d.pullInfraImage(allocID); err != nil { return nil, false, err } @@ -101,10 +92,28 @@ func (d *Driver) DestroyNetwork(allocID string, spec *drivers.NetworkIsolationSp return fmt.Errorf("failed to connect to docker daemon: %s", err) } - return client.RemoveContainer(docker.RemoveContainerOptions{ + if err := client.RemoveContainer(docker.RemoveContainerOptions{ Force: true, ID: spec.Labels[dockerNetSpecLabelKey], - }) + }); err != nil { + return err + } + + if d.config.GC.Image { + + // The Docker image ID is needed in order to correctly update the image + // reference count. Any error finding this, however, should not result + // in an error shutting down the allocrunner. + dockerImage, err := client.InspectImage(d.config.InfraImage) + if err != nil { + d.logger.Warn("InspectImage failed for infra_image container destroy", + "image", d.config.InfraImage, "error", err) + return nil + } + d.coordinator.RemoveImage(dockerImage.ID, allocID) + } + + return nil } // createSandboxContainerConfig creates a docker container configuration which @@ -124,3 +133,46 @@ func (d *Driver) createSandboxContainerConfig(allocID string, createSpec *driver }, }, nil } + +// pullInfraImage conditionally pulls the `infra_image` from the Docker registry +// only if its name uses the "latest" tag or the image doesn't already exist locally. +func (d *Driver) pullInfraImage(allocID string) error { + repo, tag := parseDockerImage(d.config.InfraImage) + + // There's a (narrow) time-of-check-time-of-use race here. If we call + // InspectImage and then a concurrent task shutdown happens before we call + // IncrementImageReference, we could end up removing the image, and it + // would no longer exist by the time we get to PullImage below. + d.coordinator.imageLock.Lock() + + if tag != "latest" { + dockerImage, err := client.InspectImage(d.config.InfraImage) + if err != nil { + d.logger.Debug("InspectImage failed for infra_image container pull", + "image", d.config.InfraImage, "error", err) + } else if dockerImage != nil { + // Image exists, so no pull is attempted; just increment its reference + // count and unlock the image lock. + d.coordinator.incrementImageReferenceImpl(dockerImage.ID, d.config.InfraImage, allocID) + d.coordinator.imageLock.Unlock() + return nil + } + } + + // At this point we have performed all the image work needed, so unlock. It + // is possible in environments with slow networks that the image pull may + // take a while, so while defer unlock would be best, this allows us to + // remove the lock sooner. + d.coordinator.imageLock.Unlock() + + authOptions, err := firstValidAuth(repo, []authBackend{ + authFromDockerConfig(d.config.Auth.Config), + authFromHelper(d.config.Auth.Helper), + }) + if err != nil { + d.logger.Debug("auth failed for infra_image container pull", "image", d.config.InfraImage, "error", err) + } + + _, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn, d.config.infraImagePullTimeoutDuration, d.config.pullActivityTimeoutDuration) + return err +} diff --git a/website/content/docs/drivers/docker.mdx b/website/content/docs/drivers/docker.mdx index c9e6c0987cc6..5d3379610ed4 100644 --- a/website/content/docs/drivers/docker.mdx +++ b/website/content/docs/drivers/docker.mdx @@ -981,7 +981,8 @@ host system. - `infra_image` - This is the Docker image to use when creating the parent container necessary when sharing network namespaces between tasks. Defaults to - `gcr.io/google_containers/pause-:3.1`. + `gcr.io/google_containers/pause-:3.1`. The image will only be pulled from + the container registry if its tag is `latest` or the image doesn't yet exist locally. - `infra_image_pull_timeout` - A time duration that controls how long Nomad will wait before cancelling an in-progress pull of the Docker image as specified in