-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow marking releases stuck in a pending state as failed #16
Changes from all commits
a40503e
1758074
43c9d29
6c6491c
ede5514
45344b5
4674a18
6addcb0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -78,6 +78,7 @@ type Reconciler struct { | |
skipDependentWatches bool | ||
maxConcurrentReconciles int | ||
reconcilePeriod time.Duration | ||
markFailedAfter time.Duration | ||
maxHistory int | ||
|
||
annotSetupOnce sync.Once | ||
|
@@ -304,6 +305,18 @@ func WithMaxReleaseHistory(maxHistory int) Option { | |
} | ||
} | ||
|
||
// WithMarkFailedAfter specifies the duration after which the reconciler will mark a release in a pending (locked) | ||
// state as false in order to allow rolling forward. | ||
func WithMarkFailedAfter(duration time.Duration) Option { | ||
return func(r *Reconciler) error { | ||
if duration < 0 { | ||
return errors.New("auto-rollback after duration must not be negative") | ||
} | ||
r.markFailedAfter = duration | ||
return nil | ||
} | ||
} | ||
|
||
// WithInstallAnnotations is an Option that configures Install annotations | ||
// to enable custom action.Install fields to be set based on the value of | ||
// annotations found in the custom resource watched by this reconciler. | ||
|
@@ -553,6 +566,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl. | |
) | ||
return ctrl.Result{}, err | ||
} | ||
if state == statePending { | ||
return r.handlePending(actionClient, rel, &u, log) | ||
} | ||
|
||
u.UpdateStatus(updater.EnsureCondition(conditions.Irreconcilable(corev1.ConditionFalse, "", ""))) | ||
|
||
for _, h := range r.preHooks { | ||
|
@@ -630,6 +647,7 @@ const ( | |
stateNeedsInstall helmReleaseState = "needs install" | ||
stateNeedsUpgrade helmReleaseState = "needs upgrade" | ||
stateUnchanged helmReleaseState = "unchanged" | ||
statePending helmReleaseState = "pending" | ||
stateError helmReleaseState = "error" | ||
) | ||
|
||
|
@@ -678,6 +696,10 @@ func (r *Reconciler) getReleaseState(client helmclient.ActionInterface, obj meta | |
return nil, stateNeedsInstall, nil | ||
} | ||
|
||
if currentRelease.Info != nil && currentRelease.Info.Status.IsPending() { | ||
return currentRelease, statePending, nil | ||
} | ||
|
||
var opts []helmclient.UpgradeOption | ||
if r.maxHistory > 0 { | ||
opts = append(opts, func(u *action.Upgrade) error { | ||
|
@@ -755,6 +777,35 @@ func (r *Reconciler) doUpgrade(actionClient helmclient.ActionInterface, u *updat | |
return rel, nil | ||
} | ||
|
||
// handlePending is called by Reconcile when the release state is statePending.
// It delegates to doHandlePending, records the resulting error on the custom
// resource as an Irreconcilable condition with reason ReasonPendingError, and
// returns that error together with an empty ctrl.Result. The returned error is
// always non-nil.
func (r *Reconciler) handlePending(actionClient helmclient.ActionInterface, rel *release.Release, u *updater.Updater, log logr.Logger) (ctrl.Result, error) { | |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add a new test case for a reconciliation with a |
||
err := r.doHandlePending(actionClient, rel, log) | ||
// Defensive fallback: a pending release must always surface an error, even
// if doHandlePending were ever to return nil.
if err == nil { | ||
err = errors.New("unknown error handling pending release") | ||
} | ||
u.UpdateStatus( | ||
updater.EnsureCondition(conditions.Irreconcilable(corev1.ConditionTrue, conditions.ReasonPendingError, err))) | ||
return ctrl.Result{}, err | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you think adding an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussed offline: the operator already uses a builtin rate-limiter to prevent overloading the API server. The situation we are handling here should be rare, and we expect that the primary cause are people running manual Helm operations. For these, the pending state will only last for ~10s at most, and we don't want to block the operator for the next 2m if they encounter such a state. Note: with dependent watches, this might be less of an issue, but we currently don't have those. |
||
} | ||
|
||
func (r *Reconciler) doHandlePending(actionClient helmclient.ActionInterface, rel *release.Release, log logr.Logger) error { | ||
if r.markFailedAfter <= 0 { | ||
return errors.New("Release is in a pending (locked) state and cannot be modified. User intervention is required.") | ||
} | ||
if rel.Info == nil || rel.Info.LastDeployed.IsZero() { | ||
return errors.New("Release is in a pending (locked) state and lacks 'last deployed' timestamp. User intervention is required.") | ||
} | ||
if pendingSince := time.Since(rel.Info.LastDeployed.Time); pendingSince < r.markFailedAfter { | ||
return fmt.Errorf("Release is in a pending (locked) state and cannot currently be modified. Release will be marked failed to allow a roll-forward in %v.", r.markFailedAfter-pendingSince) | ||
} | ||
|
||
log.Info("Marking release as failed", "releaseName", rel.Name) | ||
err := actionClient.MarkFailed(rel, fmt.Sprintf("operator marked pending (locked) release as failed after state did not change for %v", r.markFailedAfter)) | ||
if err != nil { | ||
return fmt.Errorf("Failed to mark pending (locked) release as failed: %w", err) | ||
} | ||
return fmt.Errorf("marked release %s as failed to allow upgrade to succeed in next reconcile attempt", rel.Name) | ||
} | ||
|
||
func (r *Reconciler) reportOverrideEvents(obj runtime.Object) { | ||
for k, v := range r.overrideValues { | ||
r.eventRecorder.Eventf(obj, "Warning", "ValueOverridden", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I really like the idea of marking the release as failed and letting the existing logic reconcile it. Nice! 💯