From e0f5b03a5b493d37a4d37b42c540f6aed6371ce3 Mon Sep 17 00:00:00 2001 From: Tiger Kaovilai Date: Fri, 31 May 2024 19:01:24 -0400 Subject: [PATCH] add backup finalizer patch retries Signed-off-by: Tiger Kaovilai --- pkg/client/retry.go | 22 ++++++++++--------- pkg/controller/backup_finalizer_controller.go | 6 ++++- .../restore_finalizer_controller_test.go | 4 +--- pkg/util/kube/client.go | 5 +++-- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/pkg/client/retry.go b/pkg/client/retry.go index 815419b477..c80c393766 100644 --- a/pkg/client/retry.go +++ b/pkg/client/retry.go @@ -45,7 +45,7 @@ var twoMinBackoff = wait.Backoff{ Steps: 8, Duration: 10 * time.Millisecond, Jitter: 0.1, - Cap: 2 * time.Minute, + Cap: 2 * time.Minute, } // override backoff for testing @@ -55,7 +55,7 @@ func TestOverrideBackoff() { Steps: twoMinBackoff.Steps, Duration: 1, Jitter: 0.0, - Cap: 2 * time.Minute, + Cap: 2 * time.Minute, } } @@ -65,7 +65,7 @@ func TestResetBackoff() { Steps: 8, Duration: 10 * time.Millisecond, Jitter: 0.1, - Cap: 2 * time.Minute, + Cap: 2 * time.Minute, } } @@ -75,10 +75,12 @@ func GetBackoffSteps() int { // RetriesPhasePatchFunc accepts a patch function param, retrying when the provided retriable function returns true. // We want retry to last up to 2 minutes: https://github.com/vmware-tanzu/velero/issues/7207#:~:text=A%20two%2Dminute%20retry%20is%20reasonable%20when%20there%20is%20API%20outage%20due%20to%20cert%20rotation. -func RetriesPhasePatchFunc(fn func() error, retriable func(error) bool, ) error { - return retry.OnError(twoMinBackoff, func(err error) bool { - return retriable(err) - }, func() error { - return fn() - }) -} \ No newline at end of file +func RetriesPhasePatchFunc(fn func() error, retriable func(error) bool) error { + return retry.OnError(twoMinBackoff, func(err error) bool { return retriable(err) }, fn) +} + +// RetriesPhasePatchFunc accepts a patch function param, retrying when the provided retriable function returns true. +// We want retry to last up to 2 minutes: https://github.com/vmware-tanzu/velero/issues/7207#:~:text=A%20two%2Dminute%20retry%20is%20reasonable%20when%20there%20is%20API%20outage%20due%20to%20cert%20rotation. +func RetriesPhasePatchFuncOnErrors(fn func() error) error { + return RetriesPhasePatchFunc(fn, func(err error) bool { return err != nil }) +} diff --git a/pkg/controller/backup_finalizer_controller.go b/pkg/controller/backup_finalizer_controller.go index 2bf17e9bfb..026d4b98d1 100644 --- a/pkg/controller/backup_finalizer_controller.go +++ b/pkg/controller/backup_finalizer_controller.go @@ -31,6 +31,7 @@ import ( velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" pkgbackup "github.com/vmware-tanzu/velero/pkg/backup" + "github.com/vmware-tanzu/velero/pkg/client" "github.com/vmware-tanzu/velero/pkg/itemoperation" "github.com/vmware-tanzu/velero/pkg/kuberesource" "github.com/vmware-tanzu/velero/pkg/metrics" @@ -119,7 +120,10 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ r.backupTracker.Delete(backup.Namespace, backup.Name) } // Always attempt to Patch the backup object and status after each reconciliation. - if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil { + // + // if this patch fails, there may not be another opportunity to update the backup object without external update event. + // so we retry + if err := client.RetriesPhasePatchFuncOnErrors(func() error { return r.client.Patch(ctx, backup, kbclient.MergeFrom(original)) }); err != nil { log.WithError(err).Error("Error updating backup") return } diff --git a/pkg/controller/restore_finalizer_controller_test.go b/pkg/controller/restore_finalizer_controller_test.go index 67370f2909..c2cc5ff210 100644 --- a/pkg/controller/restore_finalizer_controller_test.go +++ b/pkg/controller/restore_finalizer_controller_test.go @@ -34,8 +34,6 @@ import ( ctrl "sigs.k8s.io/controller-runtime" crclient "sigs.k8s.io/controller-runtime/pkg/client" - testingclock "k8s.io/utils/clock/testing" - "github.com/vmware-tanzu/velero/internal/hook" "github.com/vmware-tanzu/velero/internal/volume" velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" @@ -617,7 +615,7 @@ func Test_restoreFinalizerReconciler_finishProcessing(t *testing.T) { r := &restoreFinalizerReconciler{ Client: client, metrics: metrics.NewServerMetrics(), - clock: testingclock.NewFakeClock(time.Now()), + clock: testclocks.NewFakeClock(time.Now()), } restore := builder.ForRestore(velerov1api.DefaultNamespace, "restoreName").Result() if err := r.finishProcessing(velerov1api.RestorePhaseInProgress, restore, restore); (err != nil) != tt.wantErr { diff --git a/pkg/util/kube/client.go b/pkg/util/kube/client.go index 217b9f96c6..03d888b6bf 100644 --- a/pkg/util/kube/client.go +++ b/pkg/util/kube/client.go @@ -19,8 +19,9 @@ package kube import ( "context" - veleroPkgClient "github.com/vmware-tanzu/velero/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client" + + veleroPkgClient "github.com/vmware-tanzu/velero/pkg/client" ) func PatchResource(original, updated client.Object, kbClient client.Client) error { @@ -32,7 +33,7 @@ func PatchResource(original, updated client.Object, kbClient client.Client) erro // PatchResourceWithRetries patches the original resource with the updated resource, retrying when the provided retriable function returns true. // We want retry to last up to 2 minutes: https://github.com/vmware-tanzu/velero/issues/7207#:~:text=A%20two%2Dminute%20retry%20is%20reasonable%20when%20there%20is%20API%20outage%20due%20to%20cert%20rotation. func PatchResourceWithRetries(original, updated client.Object, kbClient client.Client, retriable func(error) bool) error { - return veleroPkgClient.RetriesPhasePatchFunc(func () error {return PatchResource(original, updated, kbClient)}, retriable) + return veleroPkgClient.RetriesPhasePatchFunc(func() error { return PatchResource(original, updated, kbClient) }, retriable) } // PatchResourceWithRetriesOnErrors patches the original resource with the updated resource, retrying when the operation returns an error.