Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(executor): Make docker executor more robust. #5363

Merged
merged 3 commits into from
Mar 11, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 22 additions & 14 deletions workflow/executor/docker/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import (

var errContainerNotExist = fmt.Errorf("container does not exist") // sentinel error

var started = time.Now()

type DockerExecutor struct {
namespace string
podName string
Expand Down Expand Up @@ -206,35 +208,41 @@ func (d *DockerExecutor) syncContainerIDs(ctx context.Context, containerNames []
"ps",
"--all", // container could have already exited, but there could also have been two containers for the same pod (old container not yet cleaned-up)
"--no-trunc", // display long container IDs
"--format={{.Status}}|{{.Label \"io.kubernetes.container.name\"}}|{{.ID}}", // similar to `Up 3 hours,main,035a98c4e72e`
"--format={{.Status}}|{{.Label \"io.kubernetes.container.name\"}}|{{.ID}}|{{.CreatedAt}}", // similar to `Up 3 hours,main,035a98c4e72e,2021-03-08 17:25:15 -0800 PST`
// https://github.com/kubernetes/kubernetes/blob/ca6bdba014f0a98efe0e0dd4e15f57d1c121d6c9/pkg/kubelet/dockertools/labels.go#L37
"--filter=label=io.kubernetes.pod.namespace="+d.namespace,
"--filter=label=io.kubernetes.pod.name="+d.podName,
)
if err != nil {
return err
}
containerStatus := make(map[string]string)
for _, l := range strings.Split(string(output), "\n") {
parts := strings.Split(strings.TrimSpace(l), "|")
if len(parts) != 3 {
if len(parts) != 4 {
continue
}
status := strings.SplitN(parts[0], " ", 2)[0] // Created,Exited,Up,
containerName := parts[1]
containerID := parts[2]
if d.containers[containerName] == "" && containerID != "" {
if status == "Created" { // for "Created" we must check to see if it was an early (non-zero) exit
output, err := common.RunCommand("docker", "inspect", containerID, "--format={{.State.ExitCode}}")
if err != nil {
return err
}
if strings.TrimSpace(string(output)) == "0" { // this remain "0" until it is not "Created" anymore
continue
}
}
d.containers[containerName] = containerID
log.Infof("mapped container name %q to container ID %q", containerName, containerID)
createdAt, err := time.Parse("2006-01-02 15:04:05 -0700 MST", parts[3])
if err != nil {
return err
}
if containerID == "" || d.containers[containerName] == containerID {
continue
}
if createdAt.Before(started.Add(-15 * time.Second)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is 15 seconds the approximate start time?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

correct

log.Infof("ignoring container %q created at %v, too long before process started", containerName, createdAt)
continue
}
if status == "Created" && containerStatus[containerName] != "" {
log.Infof("ignoring created container %q that would %s -> %s", containerName, containerStatus[containerName], status)
continue
}
d.containers[containerName] = containerID
containerStatus[containerName] = status
log.Infof("mapped container name %q to container ID %q (created at %v, status %s)", containerName, containerID, createdAt, status)
}
if d.haveContainers(containerNames) {
return nil
Expand Down