-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
✨ Make timeout to wait for blocked move global and configurable #9741
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,7 +51,7 @@ type ResourceMutatorFunc func(u *unstructured.Unstructured) error | |
// ObjectMover defines methods for moving Cluster API objects to another management cluster. | ||
type ObjectMover interface { | ||
// Move moves all the Cluster API objects existing in a namespace (or from all the namespaces if empty) to a target management cluster. | ||
Move(ctx context.Context, namespace string, toCluster Client, dryRun bool, mutators ...ResourceMutatorFunc) error | ||
Move(ctx context.Context, namespace string, toCluster Client, dryRun bool, waitForUnblockTimeout time.Duration, mutators ...ResourceMutatorFunc) error | ||
|
||
// ToDirectory writes all the Cluster API objects existing in a namespace (or from all the namespaces if empty) to a target directory. | ||
ToDirectory(ctx context.Context, namespace string, directory string) error | ||
|
@@ -70,7 +70,7 @@ type objectMover struct { | |
// ensure objectMover implements the ObjectMover interface. | ||
var _ ObjectMover = &objectMover{} | ||
|
||
func (o *objectMover) Move(ctx context.Context, namespace string, toCluster Client, dryRun bool, mutators ...ResourceMutatorFunc) error { | ||
func (o *objectMover) Move(ctx context.Context, namespace string, toCluster Client, dryRun bool, waitForUnblockTimeout time.Duration, mutators ...ResourceMutatorFunc) error { | ||
log := logf.Log | ||
log.Info("Performing move...") | ||
o.dryRun = dryRun | ||
|
@@ -98,7 +98,7 @@ func (o *objectMover) Move(ctx context.Context, namespace string, toCluster Clie | |
proxy = toCluster.Proxy() | ||
} | ||
|
||
return o.move(ctx, objectGraph, proxy, mutators...) | ||
return o.move(ctx, objectGraph, proxy, waitForUnblockTimeout, mutators...) | ||
} | ||
|
||
func (o *objectMover) ToDirectory(ctx context.Context, namespace string, directory string) error { | ||
|
@@ -315,7 +315,7 @@ func getMachineObj(ctx context.Context, proxy Proxy, machine *node, machineObj * | |
} | ||
|
||
// Move moves all the Cluster API objects existing in a namespace (or from all the namespaces if empty) to a target management cluster. | ||
func (o *objectMover) move(ctx context.Context, graph *objectGraph, toProxy Proxy, mutators ...ResourceMutatorFunc) error { | ||
func (o *objectMover) move(ctx context.Context, graph *objectGraph, toProxy Proxy, waitForUnblockTimeout time.Duration, mutators ...ResourceMutatorFunc) error { | ||
log := logf.Log | ||
|
||
clusters := graph.getClusters() | ||
|
@@ -336,15 +336,19 @@ func (o *objectMover) move(ctx context.Context, graph *objectGraph, toProxy Prox | |
} | ||
|
||
log.Info("Waiting for all resources to be ready to move") | ||
// exponential backoff configuration which returns durations for a total time of ~2m. | ||
// Example: 0, 5s, 8s, 11s, 17s, 26s, 38s, 57s, 86s, 128s | ||
waitForMoveUnblockedBackoff := wait.Backoff{ | ||
Duration: 5 * time.Second, | ||
Factor: 1.5, | ||
Steps: 10, | ||
// backoff to wait for a successful GET to check for the annotation | ||
getResourceBackoff := newReadBackoff() | ||
// backoff to re-check if an individual resource is blocking move. | ||
// In total, this is excessively long (>2 days) to try to make sure it's always larger than the global | ||
// timeout. The global timeout will supersede this if it is reached first. | ||
waitForResourceMoveUnblockedBackoff := wait.Backoff{ | ||
Duration: 3 * time.Second, | ||
Steps: 100, | ||
Factor: 1.1, | ||
Jitter: 0.1, | ||
Cap: 1 * time.Hour, | ||
} | ||
if err := waitReadyForMove(ctx, o.fromProxy, graph.getMoveNodes(), o.dryRun, waitForMoveUnblockedBackoff); err != nil { | ||
if err := waitReadyForMove(ctx, o.fromProxy, graph.getMoveNodes(), o.dryRun, waitForUnblockTimeout, getResourceBackoff, waitForResourceMoveUnblockedBackoff); err != nil { | ||
return errors.Wrap(err, "error waiting for resources to be ready to move") | ||
} | ||
|
||
|
@@ -610,7 +614,7 @@ func setClusterClassPause(ctx context.Context, proxy Proxy, clusterclasses []*no | |
return nil | ||
} | ||
|
||
func waitReadyForMove(ctx context.Context, proxy Proxy, nodes []*node, dryRun bool, backoff wait.Backoff) error { | ||
func waitReadyForMove(ctx context.Context, proxy Proxy, nodes []*node, dryRun bool, globalTimeout time.Duration, getResourceBackoff, waitForResourceMoveUnblockedBackoff wait.Backoff) error { | ||
if dryRun { | ||
return nil | ||
} | ||
|
@@ -622,6 +626,9 @@ func waitReadyForMove(ctx context.Context, proxy Proxy, nodes []*node, dryRun bo | |
return errors.Wrap(err, "error creating client") | ||
} | ||
|
||
ctx, cancel := context.WithTimeout(ctx, globalTimeout) | ||
defer cancel() | ||
|
||
for _, n := range nodes { | ||
log := log.WithValues( | ||
"apiVersion", n.identity.GroupVersionKind(), | ||
|
@@ -647,18 +654,16 @@ func waitReadyForMove(ctx context.Context, proxy Proxy, nodes []*node, dryRun bo | |
} | ||
key := client.ObjectKeyFromObject(obj) | ||
|
||
blockLogged := false | ||
if err := retryWithExponentialBackoff(ctx, backoff, func(ctx context.Context) error { | ||
if err := c.Get(ctx, key, obj); err != nil { | ||
log.Info(fmt.Sprintf("Move blocked by %s annotation, waiting for it to be removed", clusterctlv1.BlockMoveAnnotation)) | ||
if err := retryWithExponentialBackoff(ctx, waitForResourceMoveUnblockedBackoff, func(ctx context.Context) error { | ||
if err := retryWithExponentialBackoff(ctx, getResourceBackoff, func(ctx context.Context) error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is |
||
return c.Get(ctx, key, obj) | ||
}); err != nil { | ||
return errors.Wrapf(err, "error getting %s/%s", obj.GroupVersionKind(), key) | ||
} | ||
|
||
if _, exists := obj.GetAnnotations()[clusterctlv1.BlockMoveAnnotation]; exists { | ||
if !blockLogged { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we no longer need this check? Is this just extra code that was supposed to be removed in #9246? |
||
log.Info(fmt.Sprintf("Move blocked by %s annotation, waiting for it to be removed", clusterctlv1.BlockMoveAnnotation)) | ||
blockLogged = true | ||
} | ||
return errors.Errorf("resource is not ready to move: %s/%s", obj.GroupVersionKind(), key) | ||
return errors.New("Resource is blocking move") | ||
} | ||
log.V(5).Info("Resource is ready to move") | ||
return nil | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not super familiar with the clusterctl code base — how do you know this annotation is blocking here without checking for its existence?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`n.blockingMove`, which is checked above, is populated earlier in the flow by checking the annotation. This was added by #9246: cluster-api/cmd/clusterctl/client/cluster/objectgraph.go
Line 328 in 8f82f88