-
Notifications
You must be signed in to change notification settings - Fork 152
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Retry on transient error when waiting for snapshot to be ready #2508
Changes from 3 commits
5b8f944
fbd6e78
026debb
ea245c8
25b1ab0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,7 +33,6 @@ import ( | |
|
||
"github.com/kanisterio/kanister/pkg/blockstorage" | ||
"github.com/kanisterio/kanister/pkg/kube/snapshot/apis/v1alpha1" | ||
"github.com/kanisterio/kanister/pkg/poll" | ||
) | ||
|
||
const ( | ||
|
@@ -318,24 +317,22 @@ func (sna *SnapshotAlpha) CreateContentFromSource(ctx context.Context, source *S | |
return nil | ||
} | ||
|
||
func isReadyToUseAlpha(us *unstructured.Unstructured) (bool, error) { | ||
vs := v1alpha1.VolumeSnapshot{} | ||
if err := TransformUnstructured(us, &vs); err != nil { | ||
return false, err | ||
} | ||
// Error can be set while waiting for creation | ||
if vs.Status.Error != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just wanted to confirm that the events are in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. I don't think we read |
||
return false, errors.New(vs.Status.Error.Message) | ||
} | ||
return (vs.Status.ReadyToUse && vs.Status.CreationTime != nil), nil | ||
} | ||
|
||
// WaitOnReadyToUse will block until the Volumesnapshot in namespace 'namespace' with name 'snapshotName' | ||
// has status 'ReadyToUse' or 'ctx.Done()' is signalled. | ||
func (sna *SnapshotAlpha) WaitOnReadyToUse(ctx context.Context, snapshotName, namespace string) error { | ||
return poll.Wait(ctx, func(context.Context) (bool, error) { | ||
us, err := sna.dynCli.Resource(v1alpha1.VolSnapGVR).Namespace(namespace).Get(ctx, snapshotName, metav1.GetOptions{}) | ||
if err != nil { | ||
return false, err | ||
} | ||
vs := v1alpha1.VolumeSnapshot{} | ||
if err := TransformUnstructured(us, &vs); err != nil { | ||
return false, err | ||
} | ||
// Error can be set while waiting for creation | ||
if vs.Status.Error != nil { | ||
return false, errors.New(vs.Status.Error.Message) | ||
} | ||
return (vs.Status.ReadyToUse && vs.Status.CreationTime != nil), nil | ||
}) | ||
return waitOnReadyToUse(ctx, sna.dynCli, v1alpha1.VolSnapGVR, snapshotName, namespace, isReadyToUseAlpha) | ||
} | ||
|
||
func (sna *SnapshotAlpha) GroupVersion(ctx context.Context) schema.GroupVersion { | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -364,6 +364,158 @@ func (s *SnapshotTestSuite) TestVolumeSnapshotCloneFake(c *C) { | |||||||||||||||||||
} | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
func (s *SnapshotTestSuite) TestWaitOnReadyToUse(c *C) { | ||||||||||||||||||||
snapshotNameBase := "snap-1-fake" | ||||||||||||||||||||
volName := "pvc-1-fake" | ||||||||||||||||||||
scheme := runtime.NewScheme() | ||||||||||||||||||||
scheme.AddKnownTypeWithName(schema.GroupVersionKind{Group: "snapshot.storage.k8s.io", Version: "v1alpha1", Kind: "VolumeSnapshotClassList"}, &unstructured.UnstructuredList{}) | ||||||||||||||||||||
scheme.AddKnownTypeWithName(schema.GroupVersionKind{Group: "snapshot.storage.k8s.io", Version: "v1beta1", Kind: "VolumeSnapshotClassList"}, &unstructured.UnstructuredList{}) | ||||||||||||||||||||
scheme.AddKnownTypeWithName(schema.GroupVersionKind{Group: "snapshot.storage.k8s.io", Version: "v1", Kind: "VolumeSnapshotClassList"}, &unstructured.UnstructuredList{}) | ||||||||||||||||||||
|
||||||||||||||||||||
fakeCli := fake.NewSimpleClientset() | ||||||||||||||||||||
|
||||||||||||||||||||
size, err := resource.ParseQuantity("1Gi") | ||||||||||||||||||||
c.Assert(err, IsNil) | ||||||||||||||||||||
|
||||||||||||||||||||
pvc := &corev1.PersistentVolumeClaim{ | ||||||||||||||||||||
ObjectMeta: metav1.ObjectMeta{ | ||||||||||||||||||||
Name: volName, | ||||||||||||||||||||
}, | ||||||||||||||||||||
Spec: corev1.PersistentVolumeClaimSpec{ | ||||||||||||||||||||
Resources: corev1.ResourceRequirements{ | ||||||||||||||||||||
Requests: corev1.ResourceList{ | ||||||||||||||||||||
corev1.ResourceStorage: size, | ||||||||||||||||||||
}, | ||||||||||||||||||||
}, | ||||||||||||||||||||
}, | ||||||||||||||||||||
} | ||||||||||||||||||||
_, err = fakeCli.CoreV1().PersistentVolumeClaims(defaultNamespace).Create(context.TODO(), pvc, metav1.CreateOptions{}) | ||||||||||||||||||||
c.Assert(err, IsNil) | ||||||||||||||||||||
|
||||||||||||||||||||
dynCli := dynfake.NewSimpleDynamicClient(scheme) | ||||||||||||||||||||
|
||||||||||||||||||||
for _, fakeSs := range []snapshot.Snapshotter{ | ||||||||||||||||||||
snapshot.NewSnapshotAlpha(fakeCli, dynCli), | ||||||||||||||||||||
snapshot.NewSnapshotBeta(fakeCli, dynCli), | ||||||||||||||||||||
snapshot.NewSnapshotStable(fakeCli, dynCli), | ||||||||||||||||||||
} { | ||||||||||||||||||||
ctx := context.Background() | ||||||||||||||||||||
|
||||||||||||||||||||
var volumeSnapshotGVR schema.GroupVersionResource | ||||||||||||||||||||
var snapshotName string | ||||||||||||||||||||
switch fakeSs.(type) { | ||||||||||||||||||||
case *snapshot.SnapshotAlpha: | ||||||||||||||||||||
volumeSnapshotGVR = v1alpha1.VolSnapGVR | ||||||||||||||||||||
snapshotName = snapshotNameBase + "-alpha" | ||||||||||||||||||||
case *snapshot.SnapshotBeta: | ||||||||||||||||||||
volumeSnapshotGVR = v1beta1.VolSnapGVR | ||||||||||||||||||||
snapshotName = snapshotNameBase + "-beta" | ||||||||||||||||||||
case *snapshot.SnapshotStable: | ||||||||||||||||||||
volumeSnapshotGVR = snapshot.VolSnapGVR | ||||||||||||||||||||
snapshotName = snapshotNameBase + "-stable" | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
err = fakeSs.Create(ctx, snapshotName, defaultNamespace, volName, &fakeClass, false, nil) | ||||||||||||||||||||
c.Assert(err, IsNil) | ||||||||||||||||||||
|
||||||||||||||||||||
// This function should timeout | ||||||||||||||||||||
timeout := 500 * time.Millisecond | ||||||||||||||||||||
bgTimeout := 5 * time.Second | ||||||||||||||||||||
// We don't have readyToUse and no error, waiting indefinitely | ||||||||||||||||||||
err = waitOnReadyToUseWithTimeout(c, ctx, fakeSs, snapshotName, defaultNamespace, timeout) | ||||||||||||||||||||
c.Assert(err, NotNil) | ||||||||||||||||||||
c.Assert(err.Error(), Matches, ".*context deadline exceeded*") | ||||||||||||||||||||
|
||||||||||||||||||||
reply := waitOnReadyToUseInBackground(c, ctx, fakeSs, snapshotName, defaultNamespace, bgTimeout) | ||||||||||||||||||||
setReadyStatus(c, dynCli, volumeSnapshotGVR, snapshotName, defaultNamespace) | ||||||||||||||||||||
select { | ||||||||||||||||||||
case err = <-reply: | ||||||||||||||||||||
c.Assert(err, IsNil) | ||||||||||||||||||||
case <-time.After(2 * time.Second): | ||||||||||||||||||||
c.Error("timeout waiting on ready to use") | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
setVolumeSnapshotStatus(c, dynCli, volumeSnapshotGVR, snapshotName, defaultNamespace, nil) | ||||||||||||||||||||
|
||||||||||||||||||||
// Set non-transient error | ||||||||||||||||||||
message := "some error" | ||||||||||||||||||||
setErrorStatus(c, dynCli, volumeSnapshotGVR, snapshotName, defaultNamespace, message) | ||||||||||||||||||||
|
||||||||||||||||||||
// If there is non-transient error, exit right away | ||||||||||||||||||||
err = waitOnReadyToUseWithTimeout(c, ctx, fakeSs, snapshotName, defaultNamespace, timeout) | ||||||||||||||||||||
c.Assert(err, NotNil) | ||||||||||||||||||||
c.Assert(err.Error(), Matches, ".*some error.*") | ||||||||||||||||||||
|
||||||||||||||||||||
// Set transient error | ||||||||||||||||||||
message = "the object has been modified; please apply your changes to the latest version and try again" | ||||||||||||||||||||
setErrorStatus(c, dynCli, volumeSnapshotGVR, snapshotName, defaultNamespace, message) | ||||||||||||||||||||
|
||||||||||||||||||||
// If there is a transient error, wait with exp backoff which is long | ||||||||||||||||||||
err = waitOnReadyToUseWithTimeout(c, ctx, fakeSs, snapshotName, defaultNamespace, timeout) | ||||||||||||||||||||
c.Assert(err, NotNil) | ||||||||||||||||||||
c.Assert(err.Error(), Matches, ".*context deadline exceeded*") | ||||||||||||||||||||
|
||||||||||||||||||||
reply = waitOnReadyToUseInBackground(c, ctx, fakeSs, snapshotName, defaultNamespace, bgTimeout) | ||||||||||||||||||||
setReadyStatus(c, dynCli, volumeSnapshotGVR, snapshotName, defaultNamespace) | ||||||||||||||||||||
select { | ||||||||||||||||||||
case err = <-reply: | ||||||||||||||||||||
c.Assert(err, IsNil) | ||||||||||||||||||||
case <-time.After(2 * time.Second): | ||||||||||||||||||||
c.Error("timeout waiting on ready to use") | ||||||||||||||||||||
} | ||||||||||||||||||||
} | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
// Helpers to work with volume snapshot status used in TestWaitOnReadyToUse | ||||||||||||||||||||
// ---------------------------------------------------------------------------- | ||||||||||||||||||||
|
||||||||||||||||||||
func waitOnReadyToUseInBackground(c *C, ctx context.Context, fakeSs snapshot.Snapshotter, snapshotName string, namespace string, timeout time.Duration) chan error { | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
[optional] formatting for readability |
||||||||||||||||||||
reply := make(chan error) | ||||||||||||||||||||
go func() { | ||||||||||||||||||||
err := waitOnReadyToUseWithTimeout(c, ctx, fakeSs, snapshotName, namespace, timeout) | ||||||||||||||||||||
reply <- err | ||||||||||||||||||||
}() | ||||||||||||||||||||
return reply | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
func waitOnReadyToUseWithTimeout(c *C, ctx context.Context, fakeSs snapshot.Snapshotter, snapshotName string, namespace string, timeout time.Duration) error { | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||
deadline := time.Now().Add(timeout) | ||||||||||||||||||||
deadlineCtx, cancel := context.WithDeadline(ctx, deadline) | ||||||||||||||||||||
defer cancel() | ||||||||||||||||||||
|
||||||||||||||||||||
err := fakeSs.WaitOnReadyToUse(deadlineCtx, snapshotName, defaultNamespace) | ||||||||||||||||||||
return err | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
func setReadyStatus(c *C, dynCli *dynfake.FakeDynamicClient, volumeSnapshotGVR schema.GroupVersionResource, snapshotName string, namespace string) { | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||
status := make(map[string]interface{}) | ||||||||||||||||||||
status["readyToUse"] = true | ||||||||||||||||||||
status["creationTime"] = time.Now().Format(time.RFC3339) | ||||||||||||||||||||
|
||||||||||||||||||||
setVolumeSnapshotStatus(c, dynCli, volumeSnapshotGVR, snapshotName, namespace, status) | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
func setErrorStatus(c *C, dynCli *dynfake.FakeDynamicClient, volumeSnapshotGVR schema.GroupVersionResource, snapshotName string, namespace string, message string) { | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||
status := make(map[string]interface{}) | ||||||||||||||||||||
status["Error"] = map[string]interface{}{ | ||||||||||||||||||||
"Message": message, | ||||||||||||||||||||
} | ||||||||||||||||||||
setVolumeSnapshotStatus(c, dynCli, volumeSnapshotGVR, snapshotName, namespace, status) | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
func setVolumeSnapshotStatus(c *C, dynCli *dynfake.FakeDynamicClient, volumeSnapshotGVR schema.GroupVersionResource, snapshotName string, namespace string, status map[string]interface{}) { | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) | ||||||||||||||||||||
defer cancel() | ||||||||||||||||||||
|
||||||||||||||||||||
us, err := dynCli.Resource(volumeSnapshotGVR).Namespace(namespace).Get(ctx, snapshotName, metav1.GetOptions{}) | ||||||||||||||||||||
c.Assert(err, IsNil) | ||||||||||||||||||||
us.Object["status"] = status | ||||||||||||||||||||
_, err = dynCli.Resource(volumeSnapshotGVR).Namespace(namespace).UpdateStatus(ctx, us, metav1.UpdateOptions{}) | ||||||||||||||||||||
c.Assert(err, IsNil) | ||||||||||||||||||||
} | ||||||||||||||||||||
|
||||||||||||||||||||
// ---------------------------------------------------------------------------- | ||||||||||||||||||||
|
||||||||||||||||||||
func (s *SnapshotTestSuite) TestVolumeSnapshotAlpha(c *C) { | ||||||||||||||||||||
if s.snapshotClassAlpha == nil { | ||||||||||||||||||||
c.Skip("No v1alpha1 Volumesnapshotclass in the cluster") | ||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Question: Is this the right poll to use here? Why is 100 retries the correct value?
It appears to me that there is currently only 1 other file where Kanister code uses poll.WaitWithRetries.
It is more common to use
poll.Wait
, sometimes with the ctx set to have a timeout. Or to callpoll.WaitWithBackoff
and a maximum time set.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could do that. The difference here is that we only retry on this specific type of error, while
poll.WaitWithBackoff
would not distinguish that. The
100
value is pretty arbitrary, just to avoid an infinite wait. We could use poll.WaitWithBackoffWithRetries
to configure the max time to wait as well, but I don't know what a good value would be in this context. The current setting for the context timeout is infinity because this is a long-running process. Inserting a new time limit here would mean potentially creating a new failure scenario. Setting retries would, in the worst case, result in behaviour similar to what we have right now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As @pavannd1 indicated we can check this in as is and iterate on the polling mechanism-- this will improve robustness as is.
We can retry only on a specific error when using poll.Wait or poll.WaitWithBackoff by making the check for that error within the function called by poll. See for example how an
apierrors.IsNotFound
error is handled in RepoServiceHandler.createService.See
kanister/pkg/controllers/repositoryserver/handler.go
Line 138 in 2756ffb
Yes, there are some subtle differences between terminating a poll with a timeout or by retry count-- in particular whether ctx expiration can affect a call made within the poll loop. But I don't think those differences matter when the call being made is just a Get.
Ultimately the retry count is going to be picked based on a some knowledge of how long it is reasonable to wait for the snapshot controller to resolve the transient. In my mind it is clearer to just express that as a time value instead of trying to calculate the retry count from desired time and the backoff parameters.
See the many instances of
poll.Wait(timeoutCtx,...
in pkg/app where an app-specific timeout is known and used. Or pkg/kube.WaitForPodReady. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Upon further reflection, I've realized that the code as implemented has the following behavior:
Importantly, setting a timeout on the context at beginning of all polling will affect the first wait, the wait for controller to ready snapshot before setting any error, transient or not.
While I still think it would be better to have an explicit time limit on retrying of transients, and we probably could work out a way to build it into a retry function closure, at that point it is no longer simple and clear.
The best solution is probably to keep the retry count on transients and add a comment noting the expected time that count corresponds to under the default backoff.