Skip to content

Commit

Permalink
api: implement annotation to manually mark machines for remediation v…
Browse files Browse the repository at this point in the history
…ia MHC
  • Loading branch information
chrischdi committed Mar 13, 2024
1 parent efc4044 commit 63d56b2
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 2 deletions.
3 changes: 3 additions & 0 deletions api/v1beta1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ const (
// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"

// RemediateMachineAnnotation is the annotation used to mark machines that should be remediated by MachineHealthCheck reconciler.
RemediateMachineAnnotation = "cluster.x-k8s.io/remediate-machine"

// MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of
// preflight checks that should be skipped during the MachineSet reconciliation.
// Supported items are:
Expand Down
4 changes: 4 additions & 0 deletions api/v1beta1/condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,10 @@ const (
// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
MachineHasFailureReason = "MachineHasFailure"

// HasRemediateMachineAnnotationReason is the reason that gets set on the MachineHealthCheckSucceededCondition when a machine
// has the RemediateMachineAnnotation set.
HasRemediateMachineAnnotationReason = "HasRemediateMachineAnnotation"

// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
NodeStartupTimeoutReason = "NodeStartupTimeout"

Expand Down
1 change: 1 addition & 0 deletions docs/book/src/reference/labels_and_annotations.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
| cluster.x-k8s.io/cloned-from-name | It is the infrastructure machine annotation that stores the name of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. |
| cluster.x-k8s.io/cloned-from-groupkind | It is the infrastructure machine annotation that stores the group-kind of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. |
| cluster.x-k8s.io/skip-remediation | It is used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler. |
| cluster.x-k8s.io/remediate-machine | It can be applied to a machine to manually mark it for remediation by MachineHealthCheck reconciler. |
| cluster.x-k8s.io/managed-by | It can be applied to InfraCluster resources to signify that some external system is managing the cluster infrastructure. Provider InfraCluster controllers will ignore resources with this annotation. An external controller must fulfill the contract of the InfraCluster resource. External infrastructure providers should ensure that the annotation, once set, cannot be removed. |
| cluster.x-k8s.io/replicas-managed-by | It can be applied to MachinePool resources to signify that some external system is managing infrastructure scaling for that pool. See [the MachinePool documentation](../developer/architecture/controllers/machine-pool.md#externally-managed-autoscaler) for more details. |
| cluster.x-k8s.io/skip-machineset-preflight-checks | It can be applied on MachineDeployment and MachineSet resources to specify a comma-separated list of preflight checks that should be skipped during MachineSet reconciliation. Supported preflight checks are: All, KubeadmVersionSkew, KubernetesVersionSkew, ControlPlaneIsStable. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ func (t *healthCheckTarget) nodeName() string {

// Determine whether or not a given target needs remediation.
// The node will need remediation if any of the following are true:
// - The Machine has the remediate machine annotation
// - The Machine has failed for some reason
// - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses
// - The Node has gone away
Expand All @@ -93,6 +94,12 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
var nextCheckTimes []time.Duration
now := time.Now()

if annotations.HasRemediateMachine(t.Machine) {
conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.HasRemediateMachineAnnotationReason, clusterv1.ConditionSeverityWarning, "Marked for remediation via remediate-machine annotation")
logger.V(3).Info("Target is marked for remediation via remediate-machine annotation")
return true, time.Duration(0)
}

if t.Machine.Status.FailureReason != nil {
conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", *t.Machine.Status.FailureReason)
logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,10 @@ func TestGetTargetsFromMHC(t *testing.T) {
// machines for skip remediation
testNode5 := newTestNode("node5")
testMachine5 := newTestMachine("machine5", namespace, clusterName, testNode5.Name, mhcSelector)
testMachine5.Annotations = map[string]string{"cluster.x-k8s.io/skip-remediation": ""}
testMachine5.Annotations = map[string]string{clusterv1.MachineSkipRemediationAnnotation: ""}
testNode6 := newTestNode("node6")
testMachine6 := newTestMachine("machine6", namespace, clusterName, testNode6.Name, mhcSelector)
testMachine6.Annotations = map[string]string{"cluster.x-k8s.io/paused": ""}
testMachine6.Annotations = map[string]string{clusterv1.PausedAnnotation: ""}

testCases := []struct {
desc string
Expand Down Expand Up @@ -340,6 +340,18 @@ func TestHealthCheckTargets(t *testing.T) {
}
machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg)

// Target for when the machine has the remediate machine annotation
annotationRemediationMsg := "Marked for remediation via remediate-machine annotation"
testMachineAnnotationRemediation := testMachine.DeepCopy()
testMachineAnnotationRemediation.Annotations = map[string]string{clusterv1.RemediateMachineAnnotation: ""}
machineAnnotationRemediation := healthCheckTarget{
Cluster: cluster,
MHC: testMHC,
Machine: testMachineAnnotationRemediation,
Node: nil,
}
machineAnnotationRemediationCondition := newFailedHealthCheckCondition(clusterv1.HasRemediateMachineAnnotationReason, annotationRemediationMsg)

testCases := []struct {
desc string
targets []healthCheckTarget
Expand Down Expand Up @@ -426,6 +438,14 @@ func TestHealthCheckTargets(t *testing.T) {
expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition},
expectedNextCheckTimes: []time.Duration{},
},
{
desc: "when the machine is manually marked for remediation",
targets: []healthCheckTarget{machineAnnotationRemediation},
expectedHealthy: []healthCheckTarget{},
expectedNeedsRemediation: []healthCheckTarget{machineAnnotationRemediation},
expectedNeedsRemediationCondition: []clusterv1.Condition{machineAnnotationRemediationCondition},
expectedNextCheckTimes: []time.Duration{},
},
}

for _, tc := range testCases {
Expand Down
5 changes: 5 additions & 0 deletions util/annotations/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ func HasSkipRemediation(o metav1.Object) bool {
return hasAnnotation(o, clusterv1.MachineSkipRemediationAnnotation)
}

// HasRemediateMachine reports whether the given object carries the
// `remediate-machine` annotation (clusterv1.RemediateMachineAnnotation).
func HasRemediateMachine(o metav1.Object) bool {
	marked := hasAnnotation(o, clusterv1.RemediateMachineAnnotation)
	return marked
}

// HasWithPrefix returns true if at least one of the annotations has the prefix specified.
func HasWithPrefix(prefix string, annotations map[string]string) bool {
for key := range annotations {
Expand Down

0 comments on commit 63d56b2

Please sign in to comment.