Skip to content

Commit

Permalink
Watch external objects for machine before deleting
Browse files Browse the repository at this point in the history
This fixes a race condition in the machine controller when the controller is restarted after a machine has been marked for deletion but before the infra machine or bootstrap config are deleted. In that case, the controller doesn't have watches on those external objects, so the reconciliation is not triggered once they are deleted. This means that Machines stay in Deleting state for the resync period, the default is 10 minutes.
  • Loading branch information
g-gaston committed Jan 23, 2024
1 parent 624f892 commit 639fbb0
Show file tree
Hide file tree
Showing 3 changed files with 241 additions and 19 deletions.
24 changes: 16 additions & 8 deletions internal/controllers/machine/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
if err != nil {
switch err {
case errNoControlPlaneNodes, errLastControlPlaneNode, errNilNodeRef, errClusterIsBeingDeleted, errControlPlaneIsBeingDeleted:
var nodeName = ""
nodeName := ""
if m.Status.NodeRef != nil {
nodeName = m.Status.NodeRef.Name
}
Expand Down Expand Up @@ -427,7 +427,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
}

infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, m)
infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, cluster, m)
if err != nil {
return ctrl.Result{}, err
}
Expand All @@ -436,7 +436,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
return ctrl.Result{}, nil
}

bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, m)
bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, cluster, m)
if err != nil {
return ctrl.Result{}, err
}
Expand Down Expand Up @@ -723,8 +723,8 @@ func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster,
return nil
}

func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, m, m.Spec.Bootstrap.ConfigRef)
func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, cluster, m, m.Spec.Bootstrap.ConfigRef)
if err != nil {
return false, err
}
Expand All @@ -743,8 +743,8 @@ func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.
return false, nil
}

func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, m, &m.Spec.InfrastructureRef)
func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, cluster, m, &m.Spec.InfrastructureRef)
if err != nil {
return false, err
}
Expand All @@ -764,7 +764,7 @@ func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clust
}

// reconcileDeleteExternal tries to delete external references.
func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
if ref == nil {
return nil, nil
}
Expand All @@ -777,6 +777,14 @@ func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.M
}

if obj != nil {
// reconcileExternal ensures that we set the object's OwnerReferences correctly and watch the object.
// The machine delete logic depends on reconciling the machine when the external objects are deleted.
// This avoids a race condition where the machine is deleted before the external objects are ever reconciled
// by this controller.
if _, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref); err != nil {
return nil, err
}

// Issue a delete request.
if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
return obj, errors.Wrapf(err,
Expand Down
40 changes: 35 additions & 5 deletions internal/controllers/machine/machine_controller_phases.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,7 @@ import (
"sigs.k8s.io/cluster-api/util/patch"
)

var (
externalReadyWait = 30 * time.Second
)
var externalReadyWait = 30 * time.Second

func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) {
originalPhase := m.Status.Phase
Expand Down Expand Up @@ -89,12 +87,44 @@ func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) {

// reconcileExternal handles generic unstructured objects referenced by a Machine.
func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) {
log := ctrl.LoggerFrom(ctx)

if err := utilconversion.UpdateReferenceAPIContract(ctx, r.Client, ref); err != nil {
return external.ReconcileOutput{}, err
}

result, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref)
if err != nil {
return external.ReconcileOutput{}, err
}
if result.RequeueAfter > 0 || result.Paused {
return result, nil
}

obj := result.Result

// Set failure reason and message, if any.
failureReason, failureMessage, err := external.FailuresFrom(obj)
if err != nil {
return external.ReconcileOutput{}, err
}
if failureReason != "" {
machineStatusError := capierrors.MachineStatusError(failureReason)
m.Status.FailureReason = &machineStatusError
}
if failureMessage != "" {
m.Status.FailureMessage = ptr.To(
fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s",
obj.GroupVersionKind(), obj.GetName(), failureMessage),
)
}

return external.ReconcileOutput{Result: obj}, nil
}

// ensureExternalOwnershipAndWatch ensures that only the Machine owns the external object,
// adds a watch to the external object if one does not already exist and adds the necessary labels.
func (r *Reconciler) ensureExternalOwnershipAndWatch(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) {
log := ctrl.LoggerFrom(ctx)

obj, err := external.Get(ctx, r.UnstructuredCachingClient, ref, m.Namespace)
if err != nil {
if apierrors.IsNotFound(errors.Cause(err)) {
Expand Down
196 changes: 190 additions & 6 deletions internal/controllers/machine/machine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

Expand Down Expand Up @@ -150,7 +151,8 @@ func TestWatches(t *testing.T) {
Kind: "GenericBootstrapConfig",
Name: "bootstrap-config-machinereconcile",
},
}},
},
},
}

g.Expect(env.Create(ctx, machine)).To(Succeed())
Expand Down Expand Up @@ -183,6 +185,185 @@ func TestWatches(t *testing.T) {
}, timeout).Should(BeTrue())
}

func TestWatchesDelete(t *testing.T) {
g := NewWithT(t)
ns, err := env.CreateNamespace(ctx, "test-machine-watches-delete")
g.Expect(err).ToNot(HaveOccurred())

infraMachine := &unstructured.Unstructured{
Object: map[string]interface{}{
"kind": "GenericInfrastructureMachine",
"apiVersion": "infrastructure.cluster.x-k8s.io/v1beta1",
"metadata": map[string]interface{}{
"name": "infra-config1",
"namespace": ns.Name,
},
"spec": map[string]interface{}{
"providerID": "test://id-1",
},
"status": map[string]interface{}{
"ready": true,
"addresses": []interface{}{
map[string]interface{}{
"type": "InternalIP",
"address": "10.0.0.1",
},
},
},
},
}
infraMachineFinalizer := "test.infrastructure.cluster.x-k8s.io"
controllerutil.AddFinalizer(infraMachine, infraMachineFinalizer)

defaultBootstrap := &unstructured.Unstructured{
Object: map[string]interface{}{
"kind": "GenericBootstrapConfig",
"apiVersion": "bootstrap.cluster.x-k8s.io/v1beta1",
"metadata": map[string]interface{}{
"name": "bootstrap-config-machinereconcile",
"namespace": ns.Name,
},
"spec": map[string]interface{}{},
"status": map[string]interface{}{},
},
}
bootstrapFinalizer := "test.bootstrap.cluster.x-k8s.io"
controllerutil.AddFinalizer(defaultBootstrap, bootstrapFinalizer)

testCluster := &clusterv1.Cluster{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "machine-reconcile-",
Namespace: ns.Name,
},
Spec: clusterv1.ClusterSpec{
// we create the cluster in paused state so we don't reconcile
// the machine immediately after creation.
// This avoids going through reconcileExternal, which adds watches
// for the provider machine and the bootstrap config objects.
Paused: true,
},
}

g.Expect(env.Create(ctx, testCluster)).To(Succeed())
g.Expect(env.CreateKubeconfigSecret(ctx, testCluster)).To(Succeed())
g.Expect(env.Create(ctx, defaultBootstrap)).To(Succeed())
g.Expect(env.Create(ctx, infraMachine)).To(Succeed())

defer func(do ...client.Object) {
g.Expect(env.Cleanup(ctx, do...)).To(Succeed())
}(ns, testCluster, defaultBootstrap)

// Patch infra machine ready
patchHelper, err := patch.NewHelper(infraMachine, env)
g.Expect(err).ShouldNot(HaveOccurred())
g.Expect(unstructured.SetNestedField(infraMachine.Object, true, "status", "ready")).To(Succeed())
g.Expect(patchHelper.Patch(ctx, infraMachine, patch.WithStatusObservedGeneration{})).To(Succeed())

// Patch bootstrap ready
patchHelper, err = patch.NewHelper(defaultBootstrap, env)
g.Expect(err).ShouldNot(HaveOccurred())
g.Expect(unstructured.SetNestedField(defaultBootstrap.Object, true, "status", "ready")).To(Succeed())
g.Expect(unstructured.SetNestedField(defaultBootstrap.Object, "secretData", "status", "dataSecretName")).To(Succeed())
g.Expect(patchHelper.Patch(ctx, defaultBootstrap, patch.WithStatusObservedGeneration{})).To(Succeed())

machine := &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "machine-created-",
Namespace: ns.Name,
Labels: map[string]string{
clusterv1.MachineControlPlaneLabel: "",
},
},
Spec: clusterv1.MachineSpec{
ClusterName: testCluster.Name,
InfrastructureRef: corev1.ObjectReference{
APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1",
Kind: "GenericInfrastructureMachine",
Name: "infra-config1",
},
Bootstrap: clusterv1.Bootstrap{
ConfigRef: &corev1.ObjectReference{
APIVersion: "bootstrap.cluster.x-k8s.io/v1beta1",
Kind: "GenericBootstrapConfig",
Name: "bootstrap-config-machinereconcile",
},
},
},
}
// We create the machine with a finalizer so the machine is not deleted immediately.
controllerutil.AddFinalizer(machine, clusterv1.MachineFinalizer)

g.Expect(env.Create(ctx, machine)).To(Succeed())
defer func() {
g.Expect(env.Cleanup(ctx, machine)).To(Succeed())
}()

// We mark the machine for deletion
g.Expect(env.Delete(ctx, machine)).To(Succeed())

// We unpause the cluster so the machine can be reconciled.
testCluster.Spec.Paused = false
g.Expect(env.Update(ctx, testCluster)).To(Succeed())

// Wait for reconciliation to happen.
// The first reconciliation should add the cluster name label.
key := client.ObjectKey{Name: machine.Name, Namespace: machine.Namespace}
g.Eventually(func() bool {
if err := env.Get(ctx, key, machine); err != nil {
return false
}
return machine.Labels[clusterv1.ClusterNameLabel] == testCluster.Name
}, timeout).Should(BeTrue())

// Deleting the machine should mark the infra machine for deletion
infraMachineKey := client.ObjectKey{Name: infraMachine.GetName(), Namespace: infraMachine.GetNamespace()}
g.Eventually(func() bool {
if err := env.Get(ctx, infraMachineKey, infraMachine); err != nil {
return false
}
return infraMachine.GetDeletionTimestamp() != nil
}, timeout).Should(BeTrue(), "infra machine should be marked for deletion")

// We wait a bit and remove the finalizer, simulating the infra machine controller.
time.Sleep(2 * time.Second)
infraMachine.SetFinalizers([]string{})
g.Expect(env.Update(ctx, infraMachine)).To(Succeed())

// This should delete the infra machine
g.Eventually(func() bool {
err := env.Get(ctx, infraMachineKey, infraMachine)
return apierrors.IsNotFound(err)
}, timeout).Should(BeTrue(), "infra machine should be deleted")

// If the watch on infra machine works, deleting of the infra machine will trigger another
// reconcile, which will mark the bootstrap config for deletion
bootstrapKey := client.ObjectKey{Name: defaultBootstrap.GetName(), Namespace: defaultBootstrap.GetNamespace()}
g.Eventually(func() bool {
if err := env.Get(ctx, bootstrapKey, defaultBootstrap); err != nil {
return false
}
return defaultBootstrap.GetDeletionTimestamp() != nil
}, timeout).Should(BeTrue(), "bootstrap config should be marked for deletion")

// We wait a bit a remove the finalizer, simulating the bootstrap config controller.
time.Sleep(2 * time.Second)
defaultBootstrap.SetFinalizers([]string{})
g.Expect(env.Update(ctx, defaultBootstrap)).To(Succeed())

// This should delete the bootstrap config.
g.Eventually(func() bool {
err := env.Get(ctx, bootstrapKey, defaultBootstrap)
return apierrors.IsNotFound(err)
}, timeout).Should(BeTrue(), "bootstrap config should be deleted")

// If the watch on bootstrap config works, the deleting of the bootstrap config will trigger another
// reconcile, which will remove the finalizer and delete the machine
g.Eventually(func() bool {
err := env.Get(ctx, key, machine)
return apierrors.IsNotFound(err)
}, timeout).Should(BeTrue(), "machine should be deleted")
}

func TestMachine_Reconcile(t *testing.T) {
g := NewWithT(t)

Expand Down Expand Up @@ -250,7 +431,8 @@ func TestMachine_Reconcile(t *testing.T) {
Kind: "GenericBootstrapConfig",
Name: "bootstrap-config-machinereconcile",
},
}},
},
},
Status: clusterv1.MachineStatus{
NodeRef: &corev1.ObjectReference{
Name: "test",
Expand Down Expand Up @@ -1076,13 +1258,13 @@ func TestReconcileDeleteExternal(t *testing.T) {
UnstructuredCachingClient: c,
}

obj, err := r.reconcileDeleteExternal(ctx, machine, machine.Spec.Bootstrap.ConfigRef)
g.Expect(obj).To(BeComparableTo(tc.expected))
obj, err := r.reconcileDeleteExternal(ctx, testCluster, machine, machine.Spec.Bootstrap.ConfigRef)
if tc.expectError {
g.Expect(err).To(HaveOccurred())
} else {
g.Expect(err).ToNot(HaveOccurred())
}
g.Expect(obj).To(BeComparableTo(tc.expected))
})
}
}
Expand Down Expand Up @@ -1889,7 +2071,8 @@ func TestNodeToMachine(t *testing.T) {
Kind: "GenericBootstrapConfig",
Name: "bootstrap-config-machinereconcile",
},
}},
},
},
}

g.Expect(env.Create(ctx, expectedMachine)).To(Succeed())
Expand Down Expand Up @@ -1928,7 +2111,8 @@ func TestNodeToMachine(t *testing.T) {
Kind: "GenericBootstrapConfig",
Name: "bootstrap-config-machinereconcile",
},
}},
},
},
}

g.Expect(env.Create(ctx, randomMachine)).To(Succeed())
Expand Down

0 comments on commit 639fbb0

Please sign in to comment.