From 2f47a6d86c8f707406f9f9d3394baa70d64d5a19 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 13 Sep 2019 12:59:14 -0400 Subject: [PATCH 1/3] docker: defensive against failed starts This handles a bug where we may start a container successfully, yet we fail due to retries and startContainer not being idempotent call. Here, we ensure that when starting a container fails with 500 error, the retry succeeds if container was started successfully. --- drivers/docker/driver.go | 2 +- drivers/docker/driver_test.go | 71 +++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index ac83f376d865..dac2dbdd6cfc 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -468,7 +468,7 @@ func (d *Driver) startContainer(c *docker.Container) error { attempted := 0 START: startErr := client.StartContainer(c.ID, c.HostConfig) - if startErr == nil { + if startErr == nil || strings.Contains(startErr.Error(), "Container already running") { return nil } diff --git a/drivers/docker/driver_test.go b/drivers/docker/driver_test.go index 997ce1d9c1b9..a100aa7431e2 100644 --- a/drivers/docker/driver_test.go +++ b/drivers/docker/driver_test.go @@ -2376,3 +2376,74 @@ func waitForExist(t *testing.T, client *docker.Client, containerID string) { require.NoError(t, err) }) } + +// TestDockerDriver_CreationIdempotent asserts that createContainer and +// and startContainers functions are idempotent, as we have some retry +// logic there without ensureing we delete/destroy containers +func TestDockerDriver_CreationIdempotent(t *testing.T) { + if !tu.IsCI() { + t.Parallel() + } + testutil.DockerCompatible(t) + + task, cfg, _ := dockerTask(t) + require.NoError(t, task.EncodeConcreteDriverConfig(cfg)) + + client := newTestDockerClient(t) + driver := dockerDriverHarness(t, nil) + cleanup := driver.MkAllocDir(task, true) + defer cleanup() + + copyImage(t, task.TaskDir(), "busybox.tar") + + d, ok := driver.Impl().(*Driver) + require.True(t, ok) + + _, err := d.createImage(task, cfg, client) + require.NoError(t, err) + + containerCfg, err := d.createContainerConfig(task, cfg, cfg.Image) + require.NoError(t, err) + + c, err := d.createContainer(client, containerCfg, cfg.Image) + require.NoError(t, err) + defer client.RemoveContainer(docker.RemoveContainerOptions{ + ID: c.ID, + Force: true, + }) + + // calling createContainer again creates a new one and remove old one + c2, err := d.createContainer(client, containerCfg, cfg.Image) + require.NoError(t, err) + defer client.RemoveContainer(docker.RemoveContainerOptions{ + ID: c2.ID, + Force: true, + }) + + require.NotEqual(t, c.ID, c2.ID) + // old container was destroyed + { + _, err := client.InspectContainer(c.ID) + require.Error(t, err) + require.Contains(t, err.Error(), NoSuchContainerError) + } + + // now start container twice + require.NoError(t, d.startContainer(c2)) + require.NoError(t, d.startContainer(c2)) + + tu.WaitForResult(func() (bool, error) { + c, err := client.InspectContainer(c2.ID) + if err != nil { + return false, fmt.Errorf("failed to get container status: %v", err) + } + + if !c.State.Running { + return false, fmt.Errorf("container is not running but %v", c.State) + } + + return true, nil + }, func(err error) { + require.NoError(t, err) + }) +} From d5051687b8c2ce4085aa425cde8668b1f5ee3396 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 13 Sep 2019 15:25:31 -0400 Subject: [PATCH 2/3] retry transient docker errors within function --- drivers/docker/driver.go | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index dac2dbdd6cfc..59bc8c55a17c 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -41,6 +41,12 @@ var ( // running operations such as waiting on containers and collect stats waitClient *docker.Client + dockerTransientErrs = []string{ + "Client.Timeout exceeded while awaiting headers", + "EOF", + "API error (500)", + } + // recoverableErrTimeouts returns a recoverable error if the error was due // to timeouts recoverableErrTimeouts = func(err error) error { @@ -456,6 +462,10 @@ CREATE: // There is still a very small chance this is possible even with the // coordinator so retry. return nil, nstructs.NewRecoverableError(createErr, true) + } else if isDockerTransientError(createErr) && attempted < 5 { + attempted++ + time.Sleep(1 * time.Second) + goto CREATE } return nil, recoverableErrTimeouts(createErr) @@ -474,8 +484,7 @@ START: d.logger.Debug("failed to start container", "container_id", c.ID, "attempt", attempted+1, "error", startErr) - // If it is a 500 error it is likely we can retry and be successful - if strings.Contains(startErr.Error(), "API error (500)") { + if isDockerTransientError(startErr) { if attempted < 5 { attempted++ time.Sleep(1 * time.Second) @@ -1458,3 +1467,18 @@ func sliceMergeUlimit(ulimitsRaw map[string]string) ([]docker.ULimit, error) { func (d *Driver) Shutdown() { d.signalShutdown() } + +func isDockerTransientError(err error) bool { + if err == nil { + return false + } + + errMsg := err.Error() + for _, te := range dockerTransientErrs { + if strings.Contains(errMsg, te) { + return true + } + } + + return false +} From b5b445c1012c413bce47348b404c6d45c10a2713 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Wed, 18 Sep 2019 08:09:13 -0400 Subject: [PATCH 3/3] add exponential backoff for docker api calls --- drivers/docker/driver.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index 59bc8c55a17c..3367caeba8d3 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -455,7 +455,7 @@ CREATE: if attempted < 5 { attempted++ - time.Sleep(1 * time.Second) + time.Sleep(nextBackoff(attempted)) goto CREATE } } else if strings.Contains(strings.ToLower(createErr.Error()), "no such image") { @@ -464,7 +464,7 @@ CREATE: return nil, nstructs.NewRecoverableError(createErr, true) } else if isDockerTransientError(createErr) && attempted < 5 { attempted++ - time.Sleep(1 * time.Second) + time.Sleep(nextBackoff(attempted)) goto CREATE } @@ -487,7 +487,7 @@ START: if isDockerTransientError(startErr) { if attempted < 5 { attempted++ - time.Sleep(1 * time.Second) + time.Sleep(nextBackoff(attempted)) goto START } return nstructs.NewRecoverableError(startErr, true) @@ -496,6 +496,13 @@ START: return recoverableErrTimeouts(startErr) } +// nextBackoff returns appropriate docker backoff durations after attempted attempts. +func nextBackoff(attempted int) time.Duration { + // attempts in 200ms, 800ms, 3.2s, 12.8s, 51.2s + // TODO: add randomization factor and extract to a helper + return 1 << (2 * uint64(attempted)) * 50 * time.Millisecond +} + // createImage creates a docker image either by pulling it from a registry or by // loading it from the file system func (d *Driver) createImage(task *drivers.TaskConfig, driverConfig *TaskConfig, client *docker.Client) (string, error) {