Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow marking releases stuck in a pending state as failed #16

Merged
merged 8 commits into from
Jun 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pkg/client/actionclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ type ActionInterface interface {
Get(name string, opts ...GetOption) (*release.Release, error)
Install(name, namespace string, chrt *chart.Chart, vals map[string]interface{}, opts ...InstallOption) (*release.Release, error)
Upgrade(name, namespace string, chrt *chart.Chart, vals map[string]interface{}, opts ...UpgradeOption) (*release.Release, error)
MarkFailed(release *release.Release, reason string) error
Uninstall(name string, opts ...UninstallOption) (*release.UninstallReleaseResponse, error)
Reconcile(rel *release.Release) error
}
Expand Down Expand Up @@ -180,6 +181,14 @@ func (c *actionClient) Upgrade(name, namespace string, chrt *chart.Chart, vals m
return rel, nil
}

func (c *actionClient) MarkFailed(rel *release.Release, reason string) error {
infoCopy := *rel.Info
releaseCopy := *rel
releaseCopy.Info = &infoCopy
releaseCopy.SetStatus(release.StatusFailed, reason)
return c.conf.Releases.Update(&releaseCopy)
}

func (c *actionClient) Uninstall(name string, opts ...UninstallOption) (*release.UninstallReleaseResponse, error) {
uninstall := action.NewUninstall(c.conf)
for _, o := range opts {
Expand Down
1 change: 1 addition & 0 deletions pkg/reconciler/internal/conditions/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ const (
ReasonUpgradeError = status.ConditionReason("UpgradeError")
ReasonReconcileError = status.ConditionReason("ReconcileError")
ReasonUninstallError = status.ConditionReason("UninstallError")
ReasonPendingError = status.ConditionReason("PendingError")
)

func Initialized(stat corev1.ConditionStatus, reason status.ConditionReason, message interface{}) status.Condition {
Expand Down
34 changes: 23 additions & 11 deletions pkg/reconciler/internal/fake/actionclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,19 @@ func (hcg *fakeActionClientGetter) ActionClientFor(_ crclient.Object) (client.Ac
}

type ActionClient struct {
Gets []GetCall
Installs []InstallCall
Upgrades []UpgradeCall
Uninstalls []UninstallCall
Reconciles []ReconcileCall

HandleGet func() (*release.Release, error)
HandleInstall func() (*release.Release, error)
HandleUpgrade func() (*release.Release, error)
HandleUninstall func() (*release.UninstallReleaseResponse, error)
HandleReconcile func() error
Gets []GetCall
Installs []InstallCall
Upgrades []UpgradeCall
MarkFaileds []MarkFailedCall
Uninstalls []UninstallCall
Reconciles []ReconcileCall

HandleGet func() (*release.Release, error)
HandleInstall func() (*release.Release, error)
HandleUpgrade func() (*release.Release, error)
HandleMarkFailed func() error
HandleUninstall func() (*release.UninstallReleaseResponse, error)
HandleReconcile func() error
}

func NewActionClient() ActionClient {
Expand Down Expand Up @@ -109,6 +111,11 @@ type UpgradeCall struct {
Opts []client.UpgradeOption
}

type MarkFailedCall struct {
Release *release.Release
Reason string
}

type UninstallCall struct {
Name string
Opts []client.UninstallOption
Expand All @@ -133,6 +140,11 @@ func (c *ActionClient) Upgrade(name, namespace string, chrt *chart.Chart, vals m
return c.HandleUpgrade()
}

func (c *ActionClient) MarkFailed(rel *release.Release, reason string) error {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really like the idea to mark the release as failed and being reconciled by the existing logic. Nice! 💯

c.MarkFaileds = append(c.MarkFaileds, MarkFailedCall{rel, reason})
return c.HandleMarkFailed()
}

func (c *ActionClient) Uninstall(name string, opts ...client.UninstallOption) (*release.UninstallReleaseResponse, error) {
c.Uninstalls = append(c.Uninstalls, UninstallCall{name, opts})
return c.HandleUninstall()
Expand Down
51 changes: 51 additions & 0 deletions pkg/reconciler/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ type Reconciler struct {
skipDependentWatches bool
maxConcurrentReconciles int
reconcilePeriod time.Duration
markFailedAfter time.Duration
maxHistory int

annotSetupOnce sync.Once
Expand Down Expand Up @@ -304,6 +305,18 @@ func WithMaxReleaseHistory(maxHistory int) Option {
}
}

// WithMarkFailedAfter specifies the duration after which the reconciler will mark a release in a pending (locked)
// state as false in order to allow rolling forward.
func WithMarkFailedAfter(duration time.Duration) Option {
return func(r *Reconciler) error {
if duration < 0 {
return errors.New("auto-rollback after duration must not be negative")
}
r.markFailedAfter = duration
return nil
}
}

// WithInstallAnnotations is an Option that configures Install annotations
// to enable custom action.Install fields to be set based on the value of
// annotations found in the custom resource watched by this reconciler.
Expand Down Expand Up @@ -553,6 +566,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.
)
return ctrl.Result{}, err
}
if state == statePending {
return r.handlePending(actionClient, rel, &u, log)
}

u.UpdateStatus(updater.EnsureCondition(conditions.Irreconcilable(corev1.ConditionFalse, "", "")))

for _, h := range r.preHooks {
Expand Down Expand Up @@ -630,6 +647,7 @@ const (
stateNeedsInstall helmReleaseState = "needs install"
stateNeedsUpgrade helmReleaseState = "needs upgrade"
stateUnchanged helmReleaseState = "unchanged"
statePending helmReleaseState = "pending"
stateError helmReleaseState = "error"
)

Expand Down Expand Up @@ -678,6 +696,10 @@ func (r *Reconciler) getReleaseState(client helmclient.ActionInterface, obj meta
return nil, stateNeedsInstall, nil
}

if currentRelease.Info != nil && currentRelease.Info.Status.IsPending() {
return currentRelease, statePending, nil
}

var opts []helmclient.UpgradeOption
if r.maxHistory > 0 {
opts = append(opts, func(u *action.Upgrade) error {
Expand Down Expand Up @@ -755,6 +777,35 @@ func (r *Reconciler) doUpgrade(actionClient helmclient.ActionInterface, u *updat
return rel, nil
}

func (r *Reconciler) handlePending(actionClient helmclient.ActionInterface, rel *release.Release, u *updater.Updater, log logr.Logger) (ctrl.Result, error) {
Copy link
Member

@SimonBaeumer SimonBaeumer Jun 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a new test case for a reconciliation with a MarkFailedAfter which tests the transisition from a pending to a failed release (and maybe even the roll-forward)?
As these changes affect the core logic of our reconciliations it is important from my point of view.

err := r.doHandlePending(actionClient, rel, log)
if err == nil {
err = errors.New("unknown error handling pending release")
}
u.UpdateStatus(
updater.EnsureCondition(conditions.Irreconcilable(corev1.ConditionTrue, conditions.ReasonPendingError, err)))
return ctrl.Result{}, err
Copy link
Member

@SimonBaeumer SimonBaeumer Jun 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think adding an interval to set the RequeueAfter property is useful here?
My fear is that the reconciliations are really fast in production systems and could create bigger loads on the API server which in turn leads to a slower API server.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline: the operator already uses a builtin rate-limiter to prevent overloading the API server. The situation we are handling here should be rare, and we expect that the primary cause are people running manual Helm operations. For these, the pending state will only last for ~10s at most, and we don't want to block the operator for the next 2m if they encounter such a state.

Note: with dependent watches, this might be less of an issue, but we currently don't have those.

}

func (r *Reconciler) doHandlePending(actionClient helmclient.ActionInterface, rel *release.Release, log logr.Logger) error {
if r.markFailedAfter <= 0 {
return errors.New("Release is in a pending (locked) state and cannot be modified. User intervention is required.")
}
if rel.Info == nil || rel.Info.LastDeployed.IsZero() {
return errors.New("Release is in a pending (locked) state and lacks 'last deployed' timestamp. User intervention is required.")
}
if pendingSince := time.Since(rel.Info.LastDeployed.Time); pendingSince < r.markFailedAfter {
return fmt.Errorf("Release is in a pending (locked) state and cannot currently be modified. Release will be marked failed to allow a roll-forward in %v.", r.markFailedAfter-pendingSince)
}

log.Info("Marking release as failed", "releaseName", rel.Name)
err := actionClient.MarkFailed(rel, fmt.Sprintf("operator marked pending (locked) release as failed after state did not change for %v", r.markFailedAfter))
if err != nil {
return fmt.Errorf("Failed to mark pending (locked) release as failed: %w", err)
}
return fmt.Errorf("marked release %s as failed to allow upgrade to succeed in next reconcile attempt", rel.Name)
}

func (r *Reconciler) reportOverrideEvents(obj runtime.Object) {
for k, v := range r.overrideValues {
r.eventRecorder.Eventf(obj, "Warning", "ValueOverridden",
Expand Down