✨ KCP remediates unhealthy machines #3830

Merged
6 changes: 6 additions & 0 deletions api/v1alpha3/condition_consts.go
@@ -131,6 +131,12 @@ const (
// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
WaitingForRemediationReason = "WaitingForRemediation"

// RemediationFailedReason is the reason used when a remediation owner fails to remediate an unhealthy machine.
RemediationFailedReason = "RemediationFailed"

// RemediationInProgressReason is the reason used when an unhealthy machine is being remediated by the remediation owner.
RemediationInProgressReason = "RemediationInProgress"

// ExternalRemediationTemplateAvailable is set on machinehealthchecks when MachineHealthCheck controller uses external remediation.
// ExternalRemediationTemplateAvailable is set to false if external remediation template is not found.
ExternalRemediationTemplateAvailable ConditionType = "ExternalRemediationTemplateAvailable"
7 changes: 5 additions & 2 deletions controllers/machine_controller.go
@@ -219,10 +219,13 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
// after provisioning - e.g. when a MHC condition exists - or during the deletion process).
conditions.SetSummary(machine,
conditions.WithConditions(
// Infrastructure problems should take precedence over all the other conditions
clusterv1.InfrastructureReadyCondition,
// Bootstrap comes after, but it is relevant only during initial machine provisioning.
clusterv1.BootstrapReadyCondition,
clusterv1.MachineOwnerRemediatedCondition,
// MHC reported condition should take precedence over the remediation progress
clusterv1.MachineHealthCheckSuccededCondition,
clusterv1.MachineOwnerRemediatedCondition,
),
conditions.WithStepCounterIf(machine.ObjectMeta.DeletionTimestamp.IsZero()),
conditions.WithStepCounterIfOnly(
@@ -240,8 +243,8 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
clusterv1.BootstrapReadyCondition,
clusterv1.InfrastructureReadyCondition,
clusterv1.DrainingSucceededCondition,
clusterv1.MachineOwnerRemediatedCondition,
clusterv1.MachineHealthCheckSuccededCondition,
clusterv1.MachineOwnerRemediatedCondition,
}},
)

6 changes: 5 additions & 1 deletion controllers/machinehealthcheck_controller.go
@@ -382,7 +382,11 @@ func (r *MachineHealthCheckReconciler) PatchUnhealthyTargets(ctx context.Context
}
} else {
r.Log.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "MachineHealthCheck failed")
// NOTE: MHC is responsible for creating the MachineOwnerRemediatedCondition if it is missing, or for triggering another remediation if the previous one has completed;
// instead, if a remediation is already in progress, the remediation owner is responsible for completing the process and MHC should not overwrite the condition.
if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) {
conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
}
}
}

8 changes: 7 additions & 1 deletion controlplane/kubeadm/controllers/controller.go
@@ -302,7 +302,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *

// Aggregate the operational state of all the machines; while aggregating we are adding the
// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, controlPlane.Machines.ConditionGetters(), conditions.AddSourceRef())
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef(), conditions.WithStepCounterIf(false))

// Updates conditions reporting the status of static pods and the status of the etcd cluster.
// NOTE: Conditions reporting KCP operation progress like e.g. Resized or SpecUpToDate are inlined with the rest of the execution.
@@ -316,6 +316,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
return result, err
}

// Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate,
// otherwise continue with the other KCP operations.
if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() {
return result, err
}

// Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
needRollout := controlPlane.MachinesNeedingRollout()
switch {
15 changes: 14 additions & 1 deletion controlplane/kubeadm/controllers/fakes_test.go
@@ -56,7 +56,8 @@ func (f *fakeManagementCluster) GetMachinesForCluster(c context.Context, n clien

type fakeWorkloadCluster struct {
*internal.Workload
Status internal.ClusterStatus
Status internal.ClusterStatus
EtcdMembersResult []string
}

func (f fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, _ *clusterv1.Machine) error {
@@ -95,6 +96,18 @@ func (f fakeWorkloadCluster) UpdateKubeletConfigMap(ctx context.Context, version
return nil
}

func (f fakeWorkloadCluster) RemoveEtcdMemberForMachine(ctx context.Context, machine *clusterv1.Machine) error {
return nil
}

func (f fakeWorkloadCluster) RemoveMachineFromKubeadmConfigMap(ctx context.Context, machine *clusterv1.Machine) error {
return nil
}

func (f fakeWorkloadCluster) EtcdMembers(_ context.Context) ([]string, error) {
return f.EtcdMembersResult, nil
}

type fakeMigrator struct {
migrateCalled bool
migrateErr error
255 changes: 255 additions & 0 deletions controlplane/kubeadm/controllers/remediation.go
@@ -0,0 +1,255 @@
/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"context"
"fmt"

"github.com/pkg/errors"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// reconcileUnhealthyMachines tries to remediate unhealthy KubeadmControlPlane machines
// based on the process described in https://github.com/kubernetes-sigs/cluster-api/blob/master/docs/proposals/20191017-kubeadm-based-control-plane.md#remediation-using-delete-and-recreate
func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.Context, controlPlane *internal.ControlPlane) (ret ctrl.Result, retErr error) {
logger := r.Log.WithValues("namespace", controlPlane.KCP.Namespace, "kubeadmControlPlane", controlPlane.KCP.Name, "cluster", controlPlane.Cluster.Name)

// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
unhealthyMachines := controlPlane.UnhealthyMachines()

// If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
if len(unhealthyMachines) == 0 {
return ctrl.Result{}, nil
}

// Select the machine to be remediated, which is the oldest machine marked as unhealthy.
//
// NOTE: The current solution is considered acceptable for the most frequent use case (only one unhealthy machine),
// however, in the future this could potentially be improved for the scenario where more than one unhealthy machine exists
// by considering which machine has lower impact on etcd quorum.
machineToBeRemediated := unhealthyMachines.Oldest()

// Return early if the machine is already in the process of being deleted.
if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() {
return ctrl.Result{}, nil
}

patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)
if err != nil {
return ctrl.Result{}, err
}

defer func() {
// Always attempt to Patch the Machine conditions after each reconcileUnhealthyMachines.
if err := patchHelper.Patch(ctx, machineToBeRemediated, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{

Member: nit. WithOwnedConditions is deprecated, is there another way to use here?

Member Author: For now, I'm keeping the same approach we are using in the other controllers (WithOwnedConditions), but sooner or later we should switch everything to WithForceOverwriteConditions unless we finally switch to server side apply.

Member: We should un-deprecate it; we shouldn't use WithForceOverwriteConditions here.

clusterv1.MachineOwnerRemediatedCondition,
}}); err != nil {
logger.Error(err, "Failed to patch control plane Machine", "machine", machineToBeRemediated.Name)
if retErr == nil {
retErr = errors.Wrapf(err, "failed to patch control plane Machine %s", machineToBeRemediated.Name)
}
}
}()

// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.

desiredReplicas := int(*controlPlane.KCP.Spec.Replicas)

// The cluster MUST have spec.replicas >= 3, because this is the smallest cluster size that allows any etcd failure tolerance.
if desiredReplicas < 3 {
logger.Info("A control plane machine needs remediation, but the number of desired replicas is less than 3. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if there are less than 3 desired replicas")
return ctrl.Result{}, nil
}

// The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster
// is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first.
if controlPlane.Machines.Len() < desiredReplicas {
logger.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len())
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least %d control plane machines before triggering remediation", desiredReplicas)
return ctrl.Result{}, nil
}

// The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
if controlPlane.HasDeletingMachine() {
logger.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
return ctrl.Result{}, nil
}

// Remediation MUST preserve etcd quorum. This rule ensures that we will not remove a member that would result in etcd
// losing a majority of members and thus become unable to field new requests.
if controlPlane.IsEtcdManaged() {
canSafelyRemediate, err := r.canSafelyRemoveEtcdMember(ctx, controlPlane, machineToBeRemediated)
if err != nil {
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, err
}
if !canSafelyRemediate {
logger.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum")
return ctrl.Result{}, nil
}
}

workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster))
if err != nil {
logger.Error(err, "Failed to create client to workload cluster")
return ctrl.Result{}, errors.Wrapf(err, "failed to create client to workload cluster")
}

// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
if controlPlane.IsEtcdManaged() {
etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil {
logger.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name)
return ctrl.Result{}, err
}
if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil {
logger.Error(err, "Failed to remove etcd member for machine")
return ctrl.Result{}, err
}
}

if err := workloadCluster.RemoveMachineFromKubeadmConfigMap(ctx, machineToBeRemediated); err != nil {
logger.Error(err, "Failed to remove machine from kubeadm ConfigMap")
return ctrl.Result{}, err
}

if err := r.Client.Delete(ctx, machineToBeRemediated); err != nil {
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
}

logger.Info("Remediating unhealthy machine", "UnhealthyMachine", machineToBeRemediated.Name)
Member: Hasn't remediation already happened at this point? Or is it in progress?

Member Author: Yes, the machine object is deleted at this point and the KCP job is completed, but the actual machine deletion will take some time to happen, so from a user PoV I think it is correct to say that remediation is in progress (and it shows up in the same way in the conditions as well).

conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
return ctrl.Result{Requeue: true}, nil
}

// canSafelyRemoveEtcdMember assesses if it is possible to remove the member hosted on the machine to be remediated
// without losing etcd quorum.
//
// The answer mostly depends on the existence of other failing members on top of the one being deleted, and according
// to the etcd fault tolerance specification (see https://github.com/etcd-io/etcd/blob/master/Documentation/faq.md#what-is-failure-tolerance):
// - a 3 CP cluster does not tolerate additional failing members on top of the one being deleted (the target
// cluster size after deletion is 2, fault tolerance 0)
// - a 5 CP cluster tolerates 1 additional failing member on top of the one being deleted (the target
// cluster size after deletion is 4, fault tolerance 1)
// - a 7 CP cluster tolerates 2 additional failing members on top of the one being deleted (the target
// cluster size after deletion is 6, fault tolerance 2)
// - etc.
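//
// For example, in a 5 CP cluster the cluster projected after remediation has 4 members and a quorum of 3
// (4/2 + 1), so the remediation can be considered safe only if at most 1 of the remaining members is unhealthy.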
//
// NOTE: this func assumes the list of members is in sync with the list of machines/nodes; it is required to call reconcileEtcdMembers
// as well as reconcileControlPlaneConditions before this.
func (r *KubeadmControlPlaneReconciler) canSafelyRemoveEtcdMember(ctx context.Context, controlPlane *internal.ControlPlane, machineToBeRemediated *clusterv1.Machine) (bool, error) {
logger := r.Log.WithValues("namespace", controlPlane.KCP.Namespace, "kubeadmControlPlane", controlPlane.KCP.Name, "cluster", controlPlane.Cluster.Name)

workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, client.ObjectKey{
Namespace: controlPlane.Cluster.Namespace,
Name: controlPlane.Cluster.Name,
})
if err != nil {
return false, errors.Wrapf(err, "failed to get client for workload cluster %s", controlPlane.Cluster.Name)
}

// Gets the etcd status.
// This makes it possible to have a set of etcd member statuses different from the MHC healthy/unhealthy conditions.
etcdMembers, err := workloadCluster.EtcdMembers(ctx)
if err != nil {
return false, errors.Wrapf(err, "failed to get etcdStatus for workload cluster %s", controlPlane.Cluster.Name)
}

currentTotalMembers := len(etcdMembers)

logger.Info("etcd cluster before remediation",
"currentTotalMembers", currentTotalMembers,
"currentMembers", etcdMembers)

// The cluster MUST have at least 3 members, because this is the smallest cluster size that allows any etcd failure tolerance.
//
// NOTE: This should not happen given that we are checking the number of replicas before calling this method, however
// given that this could be destructive, this is an additional safeguard.
if currentTotalMembers < 3 {
logger.Info("etcd cluster with less of 3 members can't be safely remediated")
return false, nil
}

targetTotalMembers := currentTotalMembers - 1
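// NOTE: the untyped constant 2.0 is converted to int here, so this is integer division,
// i.e. targetQuorum = floor(targetTotalMembers/2) + 1 (e.g. 4 members -> quorum 3).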
targetQuorum := targetTotalMembers/2.0 + 1
targetUnhealthyMembers := 0

healthyMembers := []string{}
unhealthyMembers := []string{}
for _, etcdMember := range etcdMembers {
// Skip the machine to be deleted because it won't be part of the target etcd cluster.
if etcdMember == machineToBeRemediated.Name {
continue
}

// Search for the machine corresponding to the etcd member.
var machine *clusterv1.Machine
for _, m := range controlPlane.Machines {
if m.Status.NodeRef != nil && m.Status.NodeRef.Name == etcdMember {
machine = m
break
}
}

// If an etcd member does not have a corresponding machine, it is not possible to retrieve etcd member health
// so we are assuming the worst scenario and considering the member unhealthy.
//
// NOTE: This should not happen given that we are running reconcileEtcdMembers before calling this method.
if machine == nil {
logger.Info("An etcd member does not have a corresponding machine, assuming this member is unhealthy", "MemberName", etcdMember)
targetUnhealthyMembers++
unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (no machine)", etcdMember))
continue
}

// Check member health as reported by machine's health conditions
if !conditions.IsTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) {
targetUnhealthyMembers++
unhealthyMembers = append(unhealthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
continue
}

healthyMembers = append(healthyMembers, fmt.Sprintf("%s (%s)", etcdMember, machine.Name))
}

logger.Info(fmt.Sprintf("etcd cluster projected after remediation of %s", machineToBeRemediated.Name),
"healthyMembers", healthyMembers,
"unhealthyMembers", unhealthyMembers,
"targetTotalMembers", targetTotalMembers,
"targetQuorum", targetQuorum,
"targetUnhealthyMembers", targetUnhealthyMembers,
"projectedQuorum", targetTotalMembers-targetUnhealthyMembers)
if targetTotalMembers-targetUnhealthyMembers >= targetQuorum {
return true, nil
}
return false, nil
}