Fix bugs around docker start retries #6326

Merged 3 commits on Sep 18, 2019
41 changes: 36 additions & 5 deletions drivers/docker/driver.go
@@ -41,6 +41,12 @@ var (
// running operations such as waiting on containers and collect stats
waitClient *docker.Client

dockerTransientErrs = []string{
"Client.Timeout exceeded while awaiting headers",
"EOF",
"API error (500)",
}

// recoverableErrTimeouts returns a recoverable error if the error was due
// to timeouts
recoverableErrTimeouts = func(err error) error {
@@ -449,13 +455,17 @@ CREATE:

if attempted < 5 {
attempted++
time.Sleep(1 * time.Second)
time.Sleep(nextBackoff(attempted))
goto CREATE
}
} else if strings.Contains(strings.ToLower(createErr.Error()), "no such image") {
// There is still a very small chance this is possible even with the
// coordinator so retry.
return nil, nstructs.NewRecoverableError(createErr, true)
} else if isDockerTransientError(createErr) && attempted < 5 {
attempted++
time.Sleep(nextBackoff(attempted))
goto CREATE
}

return nil, recoverableErrTimeouts(createErr)
Contributor Author:

It's not obvious to me why recoverableErrTimeouts is used. StartTask doesn't special-case these errors and wraps them before returning. I considered removing it completely and doing more retries locally, but decided to be conservative and double-check the goal before making further changes to the driver.

Member:

I don't have much context to add here other than that it was brought over from the pre-0.9 drivers:

return nil, recoverableErrTimeouts(createErr)

@notnoop (Contributor Author) Sep 18, 2019:

Sounds good. This makes sense for a more aggressive refactor in a follow-up PR.
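For reference, the doc comment on recoverableErrTimeouts says it returns a recoverable error when the failure was due to a timeout. A minimal sketch of that pattern, assuming the nstructs alias and strings import already present in driver.go (the exact body in the driver may differ):

	// Sketch only: wrap err as recoverable when it looks like a client timeout
	// (or an abrupt EOF); anything else stays non-recoverable once wrapped.
	recoverableErrTimeouts = func(err error) error {
		recoverable := false
		if strings.Contains(err.Error(), "Client.Timeout exceeded while awaiting headers") ||
			strings.Contains(err.Error(), "EOF") {
			recoverable = true
		}
		return nstructs.NewRecoverableError(err, recoverable)
	}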

@@ -468,17 +478,16 @@ func (d *Driver) startContainer(c *docker.Container) error {
attempted := 0
START:
startErr := client.StartContainer(c.ID, c.HostConfig)
if startErr == nil {
if startErr == nil || strings.Contains(startErr.Error(), "Container already running") {
return nil
}

d.logger.Debug("failed to start container", "container_id", c.ID, "attempt", attempted+1, "error", startErr)

// If it is a 500 error it is likely we can retry and be successful
if strings.Contains(startErr.Error(), "API error (500)") {
if isDockerTransientError(startErr) {
if attempted < 5 {
attempted++
time.Sleep(1 * time.Second)
time.Sleep(nextBackoff(attempted))
goto START
}
return nstructs.NewRecoverableError(startErr, true)
@@ -487,6 +496,13 @@ START:
return recoverableErrTimeouts(startErr)
}

// nextBackoff returns appropriate docker backoff durations after attempted attempts.
func nextBackoff(attempted int) time.Duration {
// attempts in 200ms, 800ms, 3.2s, 12.8s, 51.2s
// TODO: add randomization factor and extract to a helper
return 1 << (2 * uint64(attempted)) * 50 * time.Millisecond
}

// createImage creates a docker image either by pulling it from a registry or by
// loading it from the file system
func (d *Driver) createImage(task *drivers.TaskConfig, driverConfig *TaskConfig, client *docker.Client) (string, error) {
@@ -1458,3 +1474,18 @@ func sliceMergeUlimit(ulimitsRaw map[string]string) ([]docker.ULimit, error) {
func (d *Driver) Shutdown() {
d.signalShutdown()
}

func isDockerTransientError(err error) bool {
if err == nil {
return false
}

errMsg := err.Error()
for _, te := range dockerTransientErrs {
if strings.Contains(errMsg, te) {
return true
}
}

return false
}
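
The nextBackoff helper above leaves a TODO for a randomization factor. A hypothetical jittered variant (not part of this PR; the name nextBackoffWithJitter and the 25% jitter factor are purely illustrative) could look like:

	// nextBackoffWithJitter sketches the TODO in nextBackoff: the same exponential
	// schedule, plus up to 25% random jitter so many retrying tasks don't wake at once.
	// Assumes "math/rand" and "time" are imported.
	func nextBackoffWithJitter(attempted int) time.Duration {
		base := 1 << (2 * uint64(attempted)) * 50 * time.Millisecond
		return base + time.Duration(rand.Int63n(int64(base/4)))
	}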
71 changes: 71 additions & 0 deletions drivers/docker/driver_test.go
@@ -2376,3 +2376,74 @@ func waitForExist(t *testing.T, client *docker.Client, containerID string) {
require.NoError(t, err)
})
}

// TestDockerDriver_CreationIdempotent asserts that the createContainer and
// startContainer functions are idempotent, as we have some retry
// logic there without ensuring we delete/destroy containers
func TestDockerDriver_CreationIdempotent(t *testing.T) {
if !tu.IsCI() {
t.Parallel()
}
testutil.DockerCompatible(t)

task, cfg, _ := dockerTask(t)
require.NoError(t, task.EncodeConcreteDriverConfig(cfg))

client := newTestDockerClient(t)
driver := dockerDriverHarness(t, nil)
cleanup := driver.MkAllocDir(task, true)
defer cleanup()

copyImage(t, task.TaskDir(), "busybox.tar")

d, ok := driver.Impl().(*Driver)
require.True(t, ok)

_, err := d.createImage(task, cfg, client)
require.NoError(t, err)

containerCfg, err := d.createContainerConfig(task, cfg, cfg.Image)
require.NoError(t, err)

c, err := d.createContainer(client, containerCfg, cfg.Image)
require.NoError(t, err)
defer client.RemoveContainer(docker.RemoveContainerOptions{
ID: c.ID,
Force: true,
})

// calling createContainer again creates a new one and removes the old one
c2, err := d.createContainer(client, containerCfg, cfg.Image)
require.NoError(t, err)
defer client.RemoveContainer(docker.RemoveContainerOptions{
ID: c2.ID,
Force: true,
})

require.NotEqual(t, c.ID, c2.ID)
// old container was destroyed
{
_, err := client.InspectContainer(c.ID)
require.Error(t, err)
require.Contains(t, err.Error(), NoSuchContainerError)
}

// now start container twice
require.NoError(t, d.startContainer(c2))
require.NoError(t, d.startContainer(c2))

tu.WaitForResult(func() (bool, error) {
c, err := client.InspectContainer(c2.ID)
if err != nil {
return false, fmt.Errorf("failed to get container status: %v", err)
}

if !c.State.Running {
return false, fmt.Errorf("container is not running but %v", c.State)
}

return true, nil
}, func(err error) {
require.NoError(t, err)
})
}