Skip to content

Commit

Permalink
add retries to a subset of k8s runner exec failures
Browse files Browse the repository at this point in the history
Signed-off-by: Josh Wolf <josh@wolfs.io>
  • Loading branch information
joshrwolf committed Jul 17, 2023
1 parent 0488212 commit e816baa
Showing 1 changed file with 44 additions and 9 deletions.
53 changes: 44 additions & 9 deletions pkg/container/kubernetes_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/client-go/tools/remotecommand"
"k8s.io/client-go/util/exec"
"knative.dev/pkg/ptr"
"sigs.k8s.io/yaml"

Expand All @@ -51,9 +52,10 @@ import (
)

const (
KubernetesName = "kubernetes"
KubernetesConfigFileName = ".melange.k8s.yaml"
kubernetesBuilderPodWorkspaceContainerName = "workspace"
KubernetesName = "kubernetes"
KubernetesConfigFileName = ".melange.k8s.yaml"

k8sBuilderPodWorkspaceContainerName = "workspace"
)

// k8s is a Runner implementation that uses kubernetes pods.
Expand Down Expand Up @@ -253,6 +255,16 @@ func (k *k8s) OCIImageLoader() Loader {

// Exec runs a command on the pod
func (k *k8s) Exec(ctx context.Context, podName string, cmd []string, streamOpts remotecommand.StreamOptions) error {
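// The k8s runner has no concept of a "WorkingDir", so we rewrite the standard
// `/bin/sh -c <script>` command so that the script first creates runnerWorkdir
// (if it does not already exist) and changes into it before running.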
if len(cmd) != 3 && cmd[0] == "/bin/sh" && cmd[1] == "-c" {
k.logger.Warnf("tried to mutate '/bin/sh -c' with working dir but got [%s %s], this might not work...", cmd[0], cmd[1])
}

cmd[2] = fmt.Sprintf(`[ -d '%s' ] || mkdir -p '%s'
cd '%s'
%s`, runnerWorkdir, runnerWorkdir, runnerWorkdir, cmd[2])

req := k.clientset.
CoreV1().
RESTClient().
Expand All @@ -262,20 +274,43 @@ func (k *k8s) Exec(ctx context.Context, podName string, cmd []string, streamOpts
Namespace(k.Config.Namespace).
SubResource("exec").
VersionedParams(&corev1.PodExecOptions{
Container: kubernetesBuilderPodWorkspaceContainerName,
Container: k8sBuilderPodWorkspaceContainerName,
Command: cmd,
Stdout: true,
Stderr: true,
}, scheme.ParameterCodec)

k.logger.Infof("executing command %v", cmd)
exec, err := remotecommand.NewSPDYExecutor(k.restConfig, "POST", req.URL())
executor, err := remotecommand.NewSPDYExecutor(k.restConfig, "POST", req.URL())
if err != nil {
return fmt.Errorf("failed to create remote command executor: %v", err)
}

if err := exec.StreamWithContext(ctx, streamOpts); err != nil {
return fmt.Errorf("failed to stream remote command: %v", err)
// Attempt the command up to 4 times, backing off between attempts with a
// 1 second initial delay that triples each time (plus up to 10% jitter)
backoff := wait.Backoff{
Steps: 4,
Duration: 1 * time.Second,
Factor: 3,
Jitter: 0.1,
}

k.logger.Infof("remote executing command %v", cmd)
if err := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) {
err := executor.StreamWithContext(ctx, streamOpts)
switch e := err.(type) {
case *exec.CodeExitError, exec.ExitError:
// Non recoverable error
k.logger.Warnf("non-recoverable error (%T) executing remote command: %v", e, err)
return false, err
case nil:
// Succeeded without error
return true, nil
}

// Everything else is retryable without altering the existing build step
k.logger.Warnf("attempting to recover (%T) after failing to execute remote command: %v", err, err)
return false, nil
}); err != nil {
return fmt.Errorf("failed executing remote command: %v", err)
}

return nil
Expand Down Expand Up @@ -441,7 +476,7 @@ func (c KubernetesRunnerConfig) defaultBuilderPod(cfg *Config) *corev1.Pod {
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{{
Name: kubernetesBuilderPodWorkspaceContainerName,
Name: k8sBuilderPodWorkspaceContainerName,
Image: cfg.ImgRef,
// ldconfig is run to prime ld.so.cache for glibc packages which require it.
Command: []string{"/bin/sh", "-c", "[ -x /sbin/ldconfig ] && /sbin/ldconfig /lib || true\nwhile true; do sleep 5; done"},
Expand Down

0 comments on commit e816baa

Please sign in to comment.