Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Only refresh bootstrap token if needed, requeue in all cases where node hasn't joined yet #9229

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 64 additions & 8 deletions bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
kerrors "k8s.io/apimachinery/pkg/util/errors"
bootstrapapi "k8s.io/cluster-bootstrap/token/api"
bootstrapsecretutil "k8s.io/cluster-bootstrap/util/secrets"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -272,7 +274,7 @@ func (r *KubeadmConfigReconciler) reconcile(ctx context.Context, scope *Scope, c
// If the BootstrapToken has been generated for a join but the config owner has no nodeRefs,
// this indicates that the node has not yet joined and the token in the join config has not
// been consumed and it may need a refresh.
return r.refreshBootstrapToken(ctx, config, cluster)
return r.refreshBootstrapTokenIfNeeded(ctx, config, cluster)
}
if configOwner.IsMachinePool() {
// If the BootstrapToken has been generated and infrastructure is ready but the configOwner is a MachinePool,
Expand Down Expand Up @@ -310,7 +312,7 @@ func (r *KubeadmConfigReconciler) reconcile(ctx context.Context, scope *Scope, c
return r.joinWorker(ctx, scope)
}

func (r *KubeadmConfigReconciler) refreshBootstrapToken(ctx context.Context, config *bootstrapv1.KubeadmConfig, cluster *clusterv1.Cluster) (ctrl.Result, error) {
func (r *KubeadmConfigReconciler) refreshBootstrapTokenIfNeeded(ctx context.Context, config *bootstrapv1.KubeadmConfig, cluster *clusterv1.Cluster) (ctrl.Result, error) {
log := ctrl.LoggerFrom(ctx)
token := config.Spec.JoinConfiguration.Discovery.BootstrapToken.Token

Expand All @@ -319,12 +321,42 @@ func (r *KubeadmConfigReconciler) refreshBootstrapToken(ctx context.Context, con
return ctrl.Result{}, err
}

log.Info("Refreshing token until the infrastructure has a chance to consume it")
if err := refreshToken(ctx, remoteClient, token, r.TokenTTL); err != nil {
AndiDog marked this conversation as resolved.
Show resolved Hide resolved
secret, err := getToken(ctx, remoteClient, token)
if err != nil {
return ctrl.Result{}, errors.Wrapf(err, "failed to get bootstrap token secret in order to refresh it")
}
log = log.WithValues("Secret", klog.KObj(secret))

secretExpiration := bootstrapsecretutil.GetData(secret, bootstrapapi.BootstrapTokenExpirationKey)
if secretExpiration == "" {
log.Info(fmt.Sprintf("Token has no valid value for %s, writing new expiration timestamp", bootstrapapi.BootstrapTokenExpirationKey))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In which case would the expiration be empty?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It shouldn't be, judging by the CAPA code. However, external operators and humans interact with Kubernetes as well, so this is regular error handling for the merely theoretical possibility of this happening.

} else {
// Assuming UTC, since we create the label value with that timezone
expiration, err := time.Parse(time.RFC3339, secretExpiration)
if err != nil {
return ctrl.Result{}, errors.Wrapf(err, "can't parse expiration time of bootstrap token")
}

now := time.Now().UTC()
skipTokenRefreshIfExpiringAfter := now.Add(r.skipTokenRefreshIfExpiringAfter())
if expiration.After(skipTokenRefreshIfExpiringAfter) {
log.V(3).Info("Token needs no refresh", "tokenExpiresInSeconds", expiration.Sub(now).Seconds())
return ctrl.Result{
RequeueAfter: r.tokenCheckRefreshOrRotationInterval(),
}, nil
}
}

// Extend TTL for existing token
newExpiration := time.Now().UTC().Add(r.TokenTTL).Format(time.RFC3339)
secret.Data[bootstrapapi.BootstrapTokenExpirationKey] = []byte(newExpiration)
log.Info("Refreshing token until the infrastructure has a chance to consume it", "oldExpiration", secretExpiration, "newExpiration", newExpiration)
AndiDog marked this conversation as resolved.
Show resolved Hide resolved
err = remoteClient.Update(ctx, secret)
if err != nil {
AndiDog marked this conversation as resolved.
Show resolved Hide resolved
return ctrl.Result{}, errors.Wrapf(err, "failed to refresh bootstrap token")
}
return ctrl.Result{
RequeueAfter: r.TokenTTL / 2,
RequeueAfter: r.tokenCheckRefreshOrRotationInterval(),
}, nil
}

Expand Down Expand Up @@ -355,7 +387,7 @@ func (r *KubeadmConfigReconciler) rotateMachinePoolBootstrapToken(ctx context.Co
return r.joinWorker(ctx, scope)
}
return ctrl.Result{
RequeueAfter: r.TokenTTL / 3,
RequeueAfter: r.tokenCheckRefreshOrRotationInterval(),
}, nil
}

Expand Down Expand Up @@ -632,7 +664,9 @@ func (r *KubeadmConfigReconciler) joinWorker(ctx context.Context, scope *Scope)
scope.Error(err, "Failed to store bootstrap data")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil

// Ensure reconciling this object again so we keep refreshing the bootstrap token until it is consumed
return ctrl.Result{RequeueAfter: r.tokenCheckRefreshOrRotationInterval()}, nil
}

func (r *KubeadmConfigReconciler) joinControlplane(ctx context.Context, scope *Scope) (ctrl.Result, error) {
Expand Down Expand Up @@ -737,7 +771,8 @@ func (r *KubeadmConfigReconciler) joinControlplane(ctx context.Context, scope *S
return ctrl.Result{}, err
}

return ctrl.Result{}, nil
// Ensure reconciling this object again so we keep refreshing the bootstrap token until it is consumed
return ctrl.Result{RequeueAfter: r.tokenCheckRefreshOrRotationInterval()}, nil
}

// resolveFiles maps .Spec.Files into cloudinit.Files, resolving any object references
Expand Down Expand Up @@ -817,6 +852,27 @@ func (r *KubeadmConfigReconciler) resolveSecretPasswordContent(ctx context.Conte
return data, nil
}

// skipTokenRefreshIfExpiringAfter yields the freshness threshold used when deciding whether a
// bootstrap token must be refreshed: a token whose expiry timestamp lies after
// `now + skipTokenRefreshIfExpiringAfter()` is considered fresh enough and is left untouched.
func (r *KubeadmConfigReconciler) skipTokenRefreshIfExpiringAfter() time.Duration {
	// The threshold is 5/6 of the configured token TTL. It is chosen relative to how often
	// reconciliation is "woken up" by `tokenCheckRefreshOrRotationInterval`: the intent is that
	// reconciliation runs at least twice before the token would expire, so that one temporary
	// failure still leaves a second chance to refresh the token.
	const (
		numerator   = 5
		denominator = 6
	)
	// Multiply first, then divide, to keep the same integer truncation as a plain `TTL * 5 / 6`.
	return r.TokenTTL * numerator / denominator
}

// tokenCheckRefreshOrRotationInterval returns the delay after which reconciliation should be
// triggered again in order to check whether a token needs to be refreshed or rotated.
func (r *KubeadmConfigReconciler) tokenCheckRefreshOrRotationInterval() time.Duration {
	// Number of reconciliations that should fit into one token lifetime.
	//
	// With three checks per TTL the last one may already come too late, which leaves the first
	// and second check to extend the expiry (refresh/rotate) and thereby tolerates one temporary
	// failure.
	//
	// This value is related to `skipTokenRefreshIfExpiringAfter` and also to token rotation
	// (which is a different mechanism from refreshing).
	const checksPerTokenTTL = 3

	return r.TokenTTL / checksPerTokenTTL
}

// ClusterToKubeadmConfigs is a handler.ToRequestsFunc to be used to enqueue
// requests for reconciliation of KubeadmConfigs.
func (r *KubeadmConfigReconciler) ClusterToKubeadmConfigs(ctx context.Context, o client.Object) []ctrl.Request {
Expand Down
Loading
Loading