Skip to content

Commit

Permalink
MGMT-10968: Replace agentmachine finalizer with machine pre-delete ho…
Browse files Browse the repository at this point in the history
…ok (openshift#54)

* Update assisted-service api and models to latest

* Watch for updates to machines related to an agentmachine

* Return machine from newAgentMachine test helper

* Replace agentmachine finalizer with machine pre-delete hook

Previously when a machine was deleted CAPI would cordon/drain the node,
then wait on the finalizer for our agentmachine resource. This chain of
events doesn't allow assisted-service time to run a pod on the node to
reboot it into discovery.

This commit adds a PreTerminateDeleteHook to the machine resource instead of
a finalizer on the agent machine. When the agentmachine controller sees
that the machine is waiting on the hook, it unbinds the agent and waits
for the assisted-service to complete the reclaim work. Once the reclaim
has completed or failed, the agentmachine controller removes the hook
annotation.

This requires specifically the PreTerminateDeleteHook rather than the
PreDrain one because once we start the agent on the cluster we won't get
a chance to drain the node before it is rebooted into discovery.

https://issues.redhat.com/browse/MGMT-10968
  • Loading branch information
carbonin authored Sep 14, 2022
1 parent 7266349 commit e6fa01e
Show file tree
Hide file tree
Showing 47 changed files with 2,236 additions and 348 deletions.
191 changes: 141 additions & 50 deletions controllers/agentmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ import (
"fmt"
"strconv"
"strings"
"time"

ignitionapi "github.com/coreos/ignition/v2/config/v3_1/types"
"github.com/go-openapi/swag"
aiv1beta1 "github.com/openshift/assisted-service/api/v1beta1"
aimodels "github.com/openshift/assisted-service/models"
capiproviderv1alpha1 "github.com/openshift/cluster-api-provider-agent/api/v1alpha1"
openshiftconditionsv1 "github.com/openshift/custom-resource-status/conditions/v1"
"github.com/sirupsen/logrus"
Expand All @@ -36,6 +38,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/selection"
"k8s.io/apimachinery/pkg/types"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
Expand All @@ -53,6 +56,8 @@ const (
AgentMachineFinalizerName = "agentmachine." + aiv1beta1.Group + "/deprovision"
AgentMachineRefLabelKey = "agentMachineRef"
AgentMachineRefNamespace = "agentMachineRefNamespace"

machineDeleteHookName = clusterv1.PreTerminateDeleteHookAnnotationPrefix + "/agentmachine"
)

// AgentMachineReconciler reconciles a AgentMachine object
Expand Down Expand Up @@ -89,16 +94,6 @@ func (r *AgentMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request
return ctrl.Result{}, client.IgnoreNotFound(err)
}

res, err := r.handleDeletionFinalizer(ctx, log, agentMachine)
if res != nil || err != nil {
return *res, err
}

// If the AgentMachine is ready, we have nothing to do
if agentMachine.Status.Ready {
return ctrl.Result{}, nil
}

machine, err := clusterutil.GetOwnerMachine(ctx, r.Client, agentMachine.ObjectMeta)
if err != nil {
log.WithError(err).Error("failed to get owner machine")
Expand All @@ -109,6 +104,16 @@ func (r *AgentMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request
return r.updateStatus(ctx, log, agentMachine, ctrl.Result{}, nil)
}

res, err := r.handleDeletionHook(ctx, log, agentMachine, machine)
if res != nil || err != nil {
return *res, err
}

// If the AgentMachine is ready, we have nothing to do
if agentMachine.Status.Ready {
return ctrl.Result{}, nil
}

agentCluster, err := r.getAgentCluster(ctx, log, machine)
if err != nil {
return ctrl.Result{}, err
Expand Down Expand Up @@ -141,51 +146,101 @@ func (r *AgentMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request
return r.updateStatus(ctx, log, agentMachine, ctrl.Result{}, nil)
}

func (r *AgentMachineReconciler) handleDeletionFinalizer(ctx context.Context, log logrus.FieldLogger, agentMachine *capiproviderv1alpha1.AgentMachine) (*ctrl.Result, error) {
if agentMachine.ObjectMeta.DeletionTimestamp.IsZero() { // AgentMachine not being deleted
// Register a finalizer if it is absent.
if !funk.ContainsString(agentMachine.GetFinalizers(), AgentMachineFinalizerName) {
controllerutil.AddFinalizer(agentMachine, AgentMachineFinalizerName)
if err := r.Update(ctx, agentMachine); err != nil {
log.WithError(err).Errorf("failed to add finalizer %s to resource %s %s", AgentMachineFinalizerName, agentMachine.Name, agentMachine.Namespace)
return &ctrl.Result{}, err
}
func (r *AgentMachineReconciler) removeMachineDeletionHookAnnotation(ctx context.Context, machine *clusterv1.Machine) (err error) {
annotations := machine.GetAnnotations()
if _, haveMachineHookAnnotation := annotations[machineDeleteHookName]; haveMachineHookAnnotation {
delete(annotations, machineDeleteHookName)
machine.SetAnnotations(annotations)
err = r.Update(ctx, machine)
}
return err
}

func (r *AgentMachineReconciler) handleDeletionHook(ctx context.Context, log logrus.FieldLogger, agentMachine *capiproviderv1alpha1.AgentMachine, machine *clusterv1.Machine) (*ctrl.Result, error) {
// TODO: this can be removed when we're sure no agent machines have this finalizer anymore
if funk.ContainsString(agentMachine.GetFinalizers(), AgentMachineFinalizerName) {
controllerutil.RemoveFinalizer(agentMachine, AgentMachineFinalizerName)
if err := r.Update(ctx, agentMachine); err != nil {
log.WithError(err).Errorf("failed to remove finalizer %s from resource %s %s", AgentMachineFinalizerName, agentMachine.Name, agentMachine.Namespace)
return &ctrl.Result{}, err
}
} else { // AgentMachine is being deleted
r.Log.Info("Found deletion timestamp on AgentMachine")
if funk.ContainsString(agentMachine.GetFinalizers(), AgentMachineFinalizerName) {
// deletion finalizer found, unbind the Agent from the ClusterDeployment
if agentMachine.Status.AgentRef != nil {
r.Log.Info("Removing ClusterDeployment ref to unbind Agent")
agent, err := r.getAgent(ctx, log, agentMachine)
if err != nil {
if apierrors.IsNotFound(err) {
log.WithError(err).Infof("Failed to get agent %s. assuming the agent no longer exists", agentMachine.Status.AgentRef.Name)
} else {
log.WithError(err).Errorf("Failed to get agent %s", agentMachine.Status.AgentRef.Name)
return &ctrl.Result{}, err
}
} else {
// Remove the AgentMachineRefLabel and set the clusterDeployment to nil
delete(agent.ObjectMeta.Labels, AgentMachineRefLabelKey)
agent.Spec.ClusterDeploymentName = nil
if err := r.Update(ctx, agent); err != nil {
log.WithError(err).Error("failed to remove the Agent's ClusterDeployment ref")
return &ctrl.Result{}, err
}
}
}
}

// set delete hook if not present and machine not being deleted
annotations := machine.GetAnnotations()
if _, haveMachineHookAnnotation := annotations[machineDeleteHookName]; !haveMachineHookAnnotation && machine.DeletionTimestamp == nil {
if annotations == nil {
annotations = make(map[string]string)
}
annotations[machineDeleteHookName] = ""
machine.SetAnnotations(annotations)
if err := r.Update(ctx, machine); err != nil {
log.WithError(err).Error("failed to add machine delete hook annotation")
return &ctrl.Result{}, err
}
// return early here as there's no reason to check if the machine is held up on this hook as we just created it
return nil, nil
}

// return if the machine is not waiting on this hook
cond := conditions.Get(machine, clusterv1.PreTerminateDeleteHookSucceededCondition)
if cond == nil || cond.Status == corev1.ConditionTrue {
return nil, nil
}

if agentMachine.Status.AgentRef == nil {
log.Info("Removing machine delete hook annotation - agent ref is nil")
if err := r.removeMachineDeletionHookAnnotation(ctx, machine); err != nil {
log.WithError(err).Error("failed to remove machine delete hook annotation")
return &ctrl.Result{}, err
}
return nil, nil
}

// remove our finalizer from the list and update it.
controllerutil.RemoveFinalizer(agentMachine, AgentMachineFinalizerName)
if err := r.Update(ctx, agentMachine); err != nil {
log.WithError(err).Errorf("failed to remove finalizer %s from resource %s %s", AgentMachineFinalizerName, agentMachine.Name, agentMachine.Namespace)
return &ctrl.Result{}, err
agent, err := r.getAgent(ctx, log, agentMachine)
if err != nil {
if apierrors.IsNotFound(err) {
log.WithError(err).Infof("Failed to get agent %s. assuming the agent no longer exists", agentMachine.Status.AgentRef.Name)
if hookErr := r.removeMachineDeletionHookAnnotation(ctx, machine); hookErr != nil {
log.WithError(hookErr).Error("failed to remove machine delete hook annotation")
return &ctrl.Result{}, hookErr
}
return nil, nil
} else {
log.WithError(err).Errorf("Failed to get agent %s", agentMachine.Status.AgentRef.Name)
return &ctrl.Result{}, err
}
}

if funk.Contains(agent.ObjectMeta.Labels, AgentMachineRefLabelKey) || agent.Spec.ClusterDeploymentName != nil {
r.Log.Info("Removing ClusterDeployment ref to unbind Agent")
delete(agent.ObjectMeta.Labels, AgentMachineRefLabelKey)
agent.Spec.ClusterDeploymentName = nil
if err := r.Update(ctx, agent); err != nil {
log.WithError(err).Error("failed to remove the Agent's ClusterDeployment ref")
return &ctrl.Result{}, err
}
r.Log.Info("AgentMachine is ready for deletion")
}

return &ctrl.Result{}, nil
// Remove the hook when either the host is back to some kind of unbound state, reclaim fails, or is not enabled
removeHookStates := []string{
aimodels.HostStatusDiscoveringUnbound,
aimodels.HostStatusKnownUnbound,
aimodels.HostStatusDisconnectedUnbound,
aimodels.HostStatusInsufficientUnbound,
aimodels.HostStatusDisabledUnbound,
aimodels.HostStatusUnbindingPendingUserAction,
aimodels.HostStatusError,
}
if funk.Contains(removeHookStates, agent.Status.DebugInfo.State) {
log.Infof("Removing machine delete hook annotation for agent in status %s", agent.Status.DebugInfo.State)
if err := r.removeMachineDeletionHookAnnotation(ctx, machine); err != nil {
log.WithError(err).Error("failed to remove machine delete hook annotation")
return &ctrl.Result{}, err
}
} else {
log.Infof("Waiting for agent %s to reboot into discovery", agent.Name)
return &ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}

return nil, nil
Expand Down Expand Up @@ -538,6 +593,41 @@ func getAddresses(foundAgent *aiv1beta1.Agent) []clusterv1.MachineAddress {
return machineAddresses
}

func (r *AgentMachineReconciler) mapMachineToAgentMachine(machine client.Object) []reconcile.Request {
log := r.Log.WithFields(
logrus.Fields{
"machine": machine.GetName(),
"machine_namespace": machine.GetNamespace(),
},
)

amList := &capiproviderv1alpha1.AgentMachineList{}
opts := &client.ListOptions{
Namespace: machine.GetNamespace(),
}
if err := r.List(context.Background(), amList, opts); err != nil {
log.Debugf("failed to list agent machines")
return []reconcile.Request{}
}

for _, agentMachine := range amList.Items {
for _, ref := range agentMachine.OwnerReferences {
gv, err := schema.ParseGroupVersion(ref.APIVersion)
if err != nil {
continue
}
if ref.Kind == "Machine" && gv.Group == clusterv1.GroupVersion.Group && ref.Name == machine.GetName() {
return []reconcile.Request{{NamespacedName: types.NamespacedName{
Namespace: agentMachine.Namespace,
Name: agentMachine.Name,
}}}
}
}
}

return []reconcile.Request{}
}

// SetupWithManager sets up the controller with the Manager.
func (r *AgentMachineReconciler) SetupWithManager(mgr ctrl.Manager, agentNamespace string) error {
mapAgentToAgentMachine := func(agent client.Object) []reconcile.Request {
Expand Down Expand Up @@ -577,5 +667,6 @@ func (r *AgentMachineReconciler) SetupWithManager(mgr ctrl.Manager, agentNamespa
return ctrl.NewControllerManagedBy(mgr).
For(&capiproviderv1alpha1.AgentMachine{}).
Watches(&source.Kind{Type: &aiv1beta1.Agent{}}, handler.EnqueueRequestsFromMapFunc(mapAgentToAgentMachine)).
Watches(&source.Kind{Type: &clusterv1.Machine{}}, handler.EnqueueRequestsFromMapFunc(r.mapMachineToAgentMachine)).
Complete(r)
}
Loading

0 comments on commit e6fa01e

Please sign in to comment.