Skip to content

Commit

Permalink
Prevent RemoveRunner spam on ephemeral runner scale down
Browse files Browse the repository at this point in the history
  • Loading branch information
mumoshu committed Mar 11, 2022
1 parent 736a53f commit 03f5cca
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 21 deletions.
4 changes: 4 additions & 0 deletions controllers/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ const (
// AnnotationKeyUnregistrationCompleteTimestamp is the annotation that is added onto the pod once the previously started unregistration process has been completed.
AnnotationKeyUnregistrationCompleteTimestamp = annotationKeyPrefix + "unregistration-complete-timestamp"

// AnnotationKeyRunnerCompletionWaitStartTimestamp is the annotation that is added onto the pod when
// ARC decided to wait until the pod to complete by itself, without the need for ARC to unregister the corresponding runner.
AnnotationKeyRunnerCompletionWaitStartTimestamp = annotationKeyPrefix + "runner-completion-wait-start-timestamp"

// unregistarionStartTimestamp is the annotation that contains the time that the requested unregistration process has been started
AnnotationKeyUnregistrationStartTimestamp = annotationKeyPrefix + "unregistration-start-timestamp"

Expand Down
3 changes: 2 additions & 1 deletion controllers/runner_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ const (
EnvVarOrg = "RUNNER_ORG"
EnvVarRepo = "RUNNER_REPO"
EnvVarEnterprise = "RUNNER_ENTERPRISE"
EnvVarEphemeral = "RUNNER_EPHEMERAL"
)

// RunnerReconciler reconciles a Runner object
Expand Down Expand Up @@ -535,7 +536,7 @@ func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.Ru
Value: workDir,
},
{
Name: "RUNNER_EPHEMERAL",
Name: EnvVarEphemeral,
Value: fmt.Sprintf("%v", ephemeral),
},
}
Expand Down
82 changes: 62 additions & 20 deletions controllers/runner_graceful_stop.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Dura
return nil, &ctrl.Result{}, err
}

if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, enterprise, organization, repository, runner, pod); res != nil {
if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
return nil, res, err
}

Expand Down Expand Up @@ -69,7 +69,7 @@ func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod
}

// If the first return value is nil, it's safe to delete the runner pod.
func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
var runnerID *int64

if id, ok := getAnnotation(pod, AnnotationKeyRunnerID); ok {
Expand All @@ -81,8 +81,25 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
runnerID = &v
}

ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID)
if err != nil {
code := runnerContainerExitCode(pod)

if pod != nil && pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
// If it's already unregistered in the previous reconcilation loop,
// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
log.Info("Runner pod is marked as already unregistered.")
} else if pod != nil && runnerPodOrContainerIsStopped(pod) {
// If it's an ephemeral runner with the actions/runner container exited with 0,
// we can safely assume that it has unregistered itself from GitHub Actions
// so it's natural that RemoveRunner fails due to 404.

// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
log.Info("Runner pod has been stopped with a successful status.")
} else if pod != nil && pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" {
log.Info("Runner pod is annotated to wait for completion")

return &ctrl.Result{RequeueAfter: retryDelay}, nil
} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID); err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
Expand All @@ -97,12 +114,12 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}

log.Error(err, "Failed to unregister runner before deleting the pod.")
log.V(1).Info("Failed to unregister runner before deleting the pod.", "error", err)

var runnerBusy bool

errRes := &gogithub.ErrorResponse{}
if errors.As(err, &errRes) {
code := runnerContainerExitCode(pod)

runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner)

var runnerID int64
Expand All @@ -111,7 +128,9 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
runnerID = *runner.ID
}

if errRes.Response.StatusCode == 422 && code != nil {
runnerBusy = errRes.Response.StatusCode == 422

if runnerBusy && code != nil {
log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+
"This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+
"ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API",
Expand All @@ -123,6 +142,28 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
}
}

if runnerBusy {
// We want to prevent spamming the deletion attemps but returning ctrl.Result with RequeueAfter doesn't
// work as the reconcilation can happen earlier due to pod status update.
// For ephemeral runners, we can expect it to stop and unregister itself on completion.
// So we can just wait for the completion without actively retrying unregistration.
ephemeral := getRunnerEnv(pod, EnvVarEphemeral)
if ephemeral == "true" {
pod, err = annotatePodOnce(ctx, c, log, pod, AnnotationKeyRunnerCompletionWaitStartTimestamp, time.Now().Format(time.RFC3339))
if err != nil {
return &ctrl.Result{}, err
}

return &ctrl.Result{}, nil
}

log.V(2).Info("Retrying runner unregistration because the static runner is still busy")
// Otherwise we may end up spamming 422 errors,
// each call consuming GitHub API rate limit
// https://github.com/actions-runner-controller/actions-runner-controller/pull/1167#issuecomment-1064213271
return &ctrl.Result{RequeueAfter: retryDelay}, nil
}

return &ctrl.Result{}, err
} else if ok {
log.Info("Runner has just been unregistered.")
Expand All @@ -135,18 +176,6 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
// In that case we can safely assume that the runner will never be registered.

log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
} else if pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
// If it's already unregistered in the previous reconcilation loop,
// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
log.Info("Runner pod is marked as already unregistered.")
} else if runnerPodOrContainerIsStopped(pod) {
// If it's an ephemeral runner with the actions/runner container exited with 0,
// we can safely assume that it has unregistered itself from GitHub Actions
// so it's natural that RemoveRunner fails due to 404.

// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
log.Info("Runner pod has been stopped with a successful status.")
} else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" {
t, err := time.Parse(time.RFC3339, ts)
if err != nil {
Expand Down Expand Up @@ -241,6 +270,19 @@ func podRunnerID(pod *corev1.Pod) string {
return id
}

func getRunnerEnv(pod *corev1.Pod, key string) string {
for _, c := range pod.Spec.Containers {
if c.Name == containerName {
for _, e := range c.Env {
if e.Name == key {
return e.Value
}
}
}
}
return ""
}

// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
Expand Down

0 comments on commit 03f5cca

Please sign in to comment.