Skip to content

Commit

Permalink
feat: Prevent blocking on transient runner registration failure
Browse files Browse the repository at this point in the history
This enhances the controller to recreate the runner pod if the corresponding runner has failed to register itself to GitHub within 10 minutes(currently hard-coded). This should alleviate #288 in case the root cause is some kind of transient failures(network unreliability, GitHub down, temporarly compute resource shortage, etc). Formerly you had to manually detect and remove those pods to unblock the controller. Since this enhancement, the controller does it automatically after 10 minutes after pod creation.

Ref #288
  • Loading branch information
mumoshu committed Feb 8, 2021
1 parent ab1c39d commit 4988c47
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 47 deletions.
86 changes: 64 additions & 22 deletions controllers/runner_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@ package controllers

import (
"context"
"errors"
"fmt"
gogithub "github.com/google/go-github/v33/github"
"github.com/summerwind/actions-runner-controller/hash"
"strings"
"time"

"github.com/go-logr/logr"
"k8s.io/apimachinery/pkg/api/errors"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
Expand All @@ -41,6 +44,8 @@ const (
finalizerName = "runner.actions.summerwind.dev"

LabelKeyPodTemplateHash = "pod-template-hash"

retryDelayOnGitHubAPIRateLimitError = 30 * time.Second
)

// RunnerReconciler reconciles a Runner object
Expand Down Expand Up @@ -97,7 +102,20 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
if len(runner.Status.Registration.Token) > 0 {
ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil {
log.Error(err, "Failed to unregister runner")
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)

return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}

return ctrl.Result{}, err
}

Expand All @@ -124,7 +142,7 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {

var pod corev1.Pod
if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
if !errors.IsNotFound(err) {
if !kerrors.IsNotFound(err) {
return ctrl.Result{}, err
}

Expand Down Expand Up @@ -194,10 +212,31 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
return ctrl.Result{}, err
}

runnerBusy, err := r.isRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
var notRegistered bool

runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil {
log.Error(err, "Failed to check if runner is busy")
return ctrl.Result{}, nil
if errors.Is(err, github.RunnerNotFound{}) {
log.Error(err, "Failed to check if runner is busy. Probably this runner has never been successfully registered to GitHub.")

notRegistered = true
} else {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)

return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}

return ctrl.Result{}, err
}
}

// See the `newPod` function called above for more information
Expand All @@ -209,9 +248,27 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
restart = true
}

registrationTimeout := 10 * time.Minute
currentTime := time.Now()
registrationDidTimeout := pod.CreationTimestamp.Add(registrationTimeout).Sub(currentTime) > 0

if !notRegistered && registrationDidTimeout {
log.Info(
"Runner failed to register itself to GitHub in timely manner. "+
"Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. "+
"See https://github.com/summerwind/actions-runner-controller/issues/288",
"podCreationTimestamp", pod.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)

restart = true
}

// Don't do anything if there's no need to restart the runner
if !restart {
return ctrl.Result{}, err
return ctrl.Result{}, nil
}

// Delete current pod if recreation is needed
Expand All @@ -227,21 +284,6 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
return ctrl.Result{}, nil
}

func (r *RunnerReconciler) isRunnerBusy(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}

for _, runner := range runners {
if runner.GetName() == name {
return runner.GetBusy(), nil
}
}

return false, fmt.Errorf("runner not found")
}

func (r *RunnerReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo)
if err != nil {
Expand Down
75 changes: 52 additions & 23 deletions controllers/runnerreplicaset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,13 @@ package controllers

import (
"context"
"errors"
"fmt"
gogithub "github.com/google/go-github/v33/github"
"time"

"github.com/go-logr/logr"
"k8s.io/apimachinery/pkg/api/errors"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -65,7 +68,7 @@ func (r *RunnerReplicaSetReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e

var allRunners v1alpha1.RunnerList
if err := r.List(ctx, &allRunners, client.InNamespace(req.Namespace)); err != nil {
if !errors.IsNotFound(err) {
if !kerrors.IsNotFound(err) {
return ctrl.Result{}, err
}
}
Expand Down Expand Up @@ -102,12 +105,54 @@ func (r *RunnerReplicaSetReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
// get runners that are currently not busy
var notBusy []v1alpha1.Runner
for _, runner := range myRunners {
busy, err := r.isRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
busy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil {
log.Error(err, "Failed to check if runner is busy")
return ctrl.Result{}, err
}
if !busy {
log.Error(err, "Failed to check if runner is busy. Probably this runner has never been successfully registered to GitHub, and therefore we prioritize it for deletion", "runnerName", runner.Name)

var notRegistered bool

if err != nil {
if errors.Is(err, github.RunnerNotFound{}) {
log.Error(err, "Failed to check if runner is busy. Probably this runner has never been successfully registered to GitHub.")

notRegistered = true
} else {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to check if runner is busy due to GitHub API rate limit. Retrying in %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)

return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}

return ctrl.Result{}, err
}
}

registrationTimeout := 15 * time.Minute
currentTime := time.Now()
registrationDidTimeout := runner.CreationTimestamp.Add(registrationTimeout).Sub(currentTime) > 0

if !notRegistered && registrationDidTimeout {
log.Info(
"Runner failed to register itself to GitHub in timely manner. "+
"Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. "+
"See https://github.com/summerwind/actions-runner-controller/issues/288",
"runnerCreationTimestamp", runner.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)

notBusy = append(notBusy, runner)
}
} else if !busy {
notBusy = append(notBusy, runner)
}
}
Expand Down Expand Up @@ -186,19 +231,3 @@ func (r *RunnerReplicaSetReconciler) SetupWithManager(mgr ctrl.Manager) error {
Owns(&v1alpha1.Runner{}).
Complete(r)
}

func (r *RunnerReplicaSetReconciler) isRunnerBusy(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo)
r.Log.Info("runners", "github", runners)
if err != nil {
return false, err
}

for _, runner := range runners {
if runner.GetName() == name {
return runner.GetBusy(), nil
}
}

return false, fmt.Errorf("runner not found")
}
27 changes: 25 additions & 2 deletions github/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ func (c *Client) RemoveRunner(ctx context.Context, enterprise, org, repo string,
res, err := c.removeRunner(ctx, enterprise, owner, repo, runnerID)

if err != nil {
return fmt.Errorf("failed to remove runner: %v", err)
return fmt.Errorf("failed to remove runner: %w", err)
}

if res.StatusCode != 204 {
Expand All @@ -149,7 +149,7 @@ func (c *Client) ListRunners(ctx context.Context, enterprise, org, repo string)
list, res, err := c.listRunners(ctx, enterprise, owner, repo, &opts)

if err != nil {
return runners, fmt.Errorf("failed to list runners: %v", err)
return runners, fmt.Errorf("failed to list runners: %w", err)
}

runners = append(runners, list.Runners...)
Expand Down Expand Up @@ -254,3 +254,26 @@ func getEnterpriseApiUrl(baseURL string) (string, error) {
// Trim trailing slash, otherwise there's double slash added to token endpoint
return fmt.Sprintf("%s://%s%s", baseEndpoint.Scheme, baseEndpoint.Host, strings.TrimSuffix(baseEndpoint.Path, "/")), nil
}

type RunnerNotFound struct {
runnerName string
}

func (e RunnerNotFound) Error() string {
return fmt.Sprintf("runner %q not found", e.runnerName)
}

func (r *Client) IsRunnerBusy(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
runners, err := r.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}

for _, runner := range runners {
if runner.GetName() == name {
return runner.GetBusy(), nil
}
}

return false, RunnerNotFound{runnerName: name}
}

0 comments on commit 4988c47

Please sign in to comment.