kubeutil.PatchResourceWithRetriesOnErrors and unit tests updates
Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>
kaovilai committed May 31, 2024
1 parent 8ef8fcb commit ba7cd07
Showing 7 changed files with 74 additions and 14 deletions.
46 changes: 46 additions & 0 deletions pkg/client/retry.go
@@ -18,8 +18,10 @@ package client

import (
"context"
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
kbclient "sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -36,3 +38,47 @@ func CreateRetryGenerateName(client kbclient.Client, ctx context.Context, obj kb
return client.Create(ctx, obj, &kbclient.CreateOptions{})
}
}

// twoMinBackoff is an exponential backoff (10ms base, factor 5, 8 steps, 10% jitter) whose individual delays are capped at two minutes.
var twoMinBackoff = wait.Backoff{
Factor: 5.0,
Steps: 8,
Duration: 10 * time.Millisecond,
Jitter: 0.1,
Cap: 2 * time.Minute,
}

// TestOverrideBackoff replaces twoMinBackoff with an effectively zero-delay backoff so unit tests can exercise retries quickly.
func TestOverrideBackoff() {
twoMinBackoff = wait.Backoff{
Factor: 1.0,
Steps: twoMinBackoff.Steps,
Duration: 1, // 1ns: effectively no delay between attempts in tests
Jitter: 0.0,
Cap: 2 * time.Minute,
}
}

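// TestResetBackoff restores the default two-minute backoff after a test that called TestOverrideBackoff.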
func TestResetBackoff() {
twoMinBackoff = wait.Backoff{
Factor: 5.0,
Steps: 8,
Duration: 10 * time.Millisecond,
Jitter: 0.1,
Cap: 2 * time.Minute,
}
}

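// GetBackoffSteps returns the number of attempts the current backoff allows; exposed so tests can assert expected call counts.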
func GetBackoffSteps() int {
return twoMinBackoff.Steps
}

// RetriesPhasePatchFunc runs the provided patch function, retrying with twoMinBackoff whenever the retriable function returns true for the returned error.
// We want retry to last up to 2 minutes: https://github.com/vmware-tanzu/velero/issues/7207#:~:text=A%20two%2Dminute%20retry%20is%20reasonable%20when%20there%20is%20API%20outage%20due%20to%20cert%20rotation.
func RetriesPhasePatchFunc(fn func() error, retriable func(error) bool) error {
return retry.OnError(twoMinBackoff, func(err error) bool {
return retriable(err)
}, func() error {
return fn()
})
}
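For reference, the delay schedule these parameters produce can be seen with a small standalone program (illustrative only, not part of this commit), assuming the standard wait.Backoff.Step semantics; Jitter is left at zero here so the output is deterministic:

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// Same parameters as twoMinBackoff above, minus Jitter.
	b := wait.Backoff{Factor: 5.0, Steps: 8, Duration: 10 * time.Millisecond, Cap: 2 * time.Minute}
	for b.Steps > 0 {
		// Prints 10ms, 50ms, 250ms, 1.25s, 6.25s, 31.25s. Once the next delay
		// would exceed the 2m cap, Step clamps it and zeroes Steps, ending the loop.
		fmt.Println(b.Step())
	}
}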
2 changes: 1 addition & 1 deletion pkg/controller/backup_controller.go
@@ -305,7 +305,7 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
b.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
}
log.Info("Updating backup's final status")
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
if err := kubeutil.PatchResourceWithRetriesOnErrors(original, request.Backup, b.kbClient); err != nil {
log.WithError(err).Error("error updating backup's final status")
}
return ctrl.Result{}, nil
2 changes: 1 addition & 1 deletion pkg/controller/restore_controller.go
@@ -276,7 +276,7 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
}
log.Debug("Updating restore's final status")

if err = kubeutil.PatchResource(original, restore, r.kbClient); err != nil {
if err = kubeutil.PatchResourceWithRetriesOnErrors(original, restore, r.kbClient); err != nil {
log.WithError(errors.WithStack(err)).Info("Error updating restore's final status")
// No need to re-enqueue here, because restore's already set to InProgress before.
// Controller only handle New restore.
12 changes: 2 additions & 10 deletions pkg/controller/restore_finalizer_controller.go
@@ -26,7 +26,6 @@ import (
"github.com/sirupsen/logrus"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

@@ -232,15 +231,8 @@ func (r *restoreFinalizerReconciler) finishProcessing(restorePhase velerov1api.R
r.metrics.RegisterRestoreSuccess(restore.Spec.ScheduleName)
}
restore.Status.CompletionTimestamp = &metav1.Time{Time: r.clock.Now()}
// retry using DefaultBackoff to resolve connection refused error that may occur when the server is under heavy load
// TODO: consider using a more specific error type to retry, for now, we retry on all errors
// specific errors:
// - connection refused: https://pkg.go.dev/syscall#:~:text=Errno(0x67)-,ECONNREFUSED,-%3D%20Errno(0x6f
return retry.OnError(retry.DefaultBackoff, func(err error) bool {
return err != nil
}, func() error {
return kubeutil.PatchResource(original, restore, r.Client)
})

return kubeutil.PatchResourceWithRetriesOnErrors(original, restore, r.Client)
}

// finalizerContext includes all the dependencies required by finalization tasks and
6 changes: 4 additions & 2 deletions pkg/controller/restore_finalizer_controller_test.go
@@ -30,7 +30,6 @@ import (
corev1api "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/retry"
testclocks "k8s.io/utils/clock/testing"
ctrl "sigs.k8s.io/controller-runtime"
crclient "sigs.k8s.io/controller-runtime/pkg/client"
@@ -41,6 +40,7 @@ import (
"github.com/vmware-tanzu/velero/internal/volume"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/builder"
veleroPkgClient "github.com/vmware-tanzu/velero/pkg/client"
"github.com/vmware-tanzu/velero/pkg/metrics"
persistencemocks "github.com/vmware-tanzu/velero/pkg/persistence/mocks"
"github.com/vmware-tanzu/velero/pkg/plugin/clientmgmt"
@@ -560,6 +560,8 @@ func TestWaitRestoreExecHook(t *testing.T) {

// test finishProcessing with mocks of the kube client to simulate connection refused errors
func Test_restoreFinalizerReconciler_finishProcessing(t *testing.T) {
veleroPkgClient.TestOverrideBackoff()
defer veleroPkgClient.TestResetBackoff()
type args struct {
// mockClientActions simulate different client errors
mockClientActions func(*pkgUtilKubeMocks.Client)
@@ -590,7 +592,7 @@ func Test_restoreFinalizerReconciler_finishProcessing(t *testing.T) {
client.On("Patch", mock.Anything, mock.Anything, mock.Anything).Return(syscall.ECONNREFUSED)
},
mockClientAsserts: func(client *pkgUtilKubeMocks.Client) bool {
return client.AssertNumberOfCalls(t, "Patch", retry.DefaultBackoff.Steps)
return client.AssertNumberOfCalls(t, "Patch", veleroPkgClient.GetBackoffSteps())
},
},
wantErr: true,
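As an aside, the same retry-exhaustion behavior can be checked without the kube client mocks. A hypothetical test sketch (not part of this commit) that could live alongside pkg/client and uses only the helpers added in pkg/client/retry.go:

package client_test

import (
	"syscall"
	"testing"

	"github.com/stretchr/testify/require"

	veleroPkgClient "github.com/vmware-tanzu/velero/pkg/client"
)

// TestRetriesPhasePatchFunc_exhaustsSteps is illustrative: with the test
// backoff override the retry loop is near-instant, and the number of attempts
// should equal GetBackoffSteps().
func TestRetriesPhasePatchFunc_exhaustsSteps(t *testing.T) {
	veleroPkgClient.TestOverrideBackoff()
	defer veleroPkgClient.TestResetBackoff()

	calls := 0
	err := veleroPkgClient.RetriesPhasePatchFunc(func() error {
		calls++
		return syscall.ECONNREFUSED // always fail so every backoff step is consumed
	}, func(error) bool {
		return true // treat every error as retriable
	})

	require.Error(t, err)
	require.Equal(t, veleroPkgClient.GetBackoffSteps(), calls)
}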
2 changes: 2 additions & 0 deletions pkg/controller/restore_operations_controller.go
@@ -130,6 +130,7 @@ func (r *restoreOperationsReconciler) Reconcile(ctx context.Context, req ctrl.Re
if err != nil {
log.Warnf("Cannot check progress on Restore operations because backup info is unavailable %s; marking restore FinalizingPartiallyFailed", err.Error())
restore.Status.Phase = velerov1api.RestorePhaseFinalizingPartiallyFailed
// No need to retry, as Reconcile will be called again
err2 := r.updateRestoreAndOperationsJSON(ctx, original, restore, nil, &itemoperationmap.OperationsForRestore{ErrsSinceUpdate: []string{err.Error()}}, false, false)
if err2 != nil {
log.WithError(err2).Error("error updating Restore")
@@ -183,6 +184,7 @@ func (r *restoreOperationsReconciler) Reconcile(ctx context.Context, req ctrl.Re
restore.Status.Phase = velerov1api.RestorePhaseFinalizingPartiallyFailed
}
}
// No need to retry as Reconcile will be called again
err = r.updateRestoreAndOperationsJSON(ctx, original, restore, backupStore, operations, changes, completionChanges)
if err != nil {
return ctrl.Result{}, errors.Wrap(err, "error updating Restore")
18 changes: 18 additions & 0 deletions pkg/util/kube/client.go
@@ -19,6 +19,7 @@ package kube
import (
"context"

veleroPkgClient "github.com/vmware-tanzu/velero/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client"
)

@@ -27,3 +28,20 @@ func PatchResource(original, updated client.Object, kbClient client.Client) erro

return err
}

// PatchResourceWithRetries patches the original resource with the updated resource, retrying when the provided retriable function returns true.
// We want retry to last up to 2 minutes: https://github.com/vmware-tanzu/velero/issues/7207#:~:text=A%20two%2Dminute%20retry%20is%20reasonable%20when%20there%20is%20API%20outage%20due%20to%20cert%20rotation.
func PatchResourceWithRetries(original, updated client.Object, kbClient client.Client, retriable func(error) bool) error {
return veleroPkgClient.RetriesPhasePatchFunc(func() error { return PatchResource(original, updated, kbClient) }, retriable)
}

// PatchResourceWithRetriesOnErrors patches the original resource with the updated resource, retrying when the operation returns an error.
func PatchResourceWithRetriesOnErrors(original, updated client.Object, kbClient client.Client) error {
return PatchResourceWithRetries(original, updated, kbClient, func(err error) bool {
// Retry on all errors for now to cover transient failures, e.g. connection
// refused when the API server is under heavy load.
// TODO: consider retrying only on specific error types, such as:
// - connection refused: https://pkg.go.dev/syscall#:~:text=Errno(0x67)-,ECONNREFUSED,-%3D%20Errno(0x6f
return err != nil
})
}
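As a usage note, a caller that wants to retry only a specific class of errors could pass its own predicate to PatchResourceWithRetries. A hypothetical sketch (not part of this commit), with the ECONNREFUSED check mirroring the error called out in the comment above:

package example

import (
	"errors"
	"syscall"

	"sigs.k8s.io/controller-runtime/pkg/client"

	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
	kubeutil "github.com/vmware-tanzu/velero/pkg/util/kube"
)

// patchFinalStatus marks the restore Completed and patches it, retrying only
// while the API server is refusing connections (e.g. during cert rotation).
func patchFinalStatus(kbClient client.Client, restore *velerov1api.Restore) error {
	original := restore.DeepCopy()
	restore.Status.Phase = velerov1api.RestorePhaseCompleted
	return kubeutil.PatchResourceWithRetries(original, restore, kbClient, func(err error) bool {
		return errors.Is(err, syscall.ECONNREFUSED)
	})
}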
