Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ MHC: implement annotation to manually mark machines for remediation #10202

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions api/v1beta1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ const (
// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"

// RemediateMachineAnnotation is the annotation used to mark machines that should be remediated by MachineHealthCheck reconciler.
RemediateMachineAnnotation = "cluster.x-k8s.io/remediate-machine"
fabriziopandini marked this conversation as resolved.
Show resolved Hide resolved

// MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of
// preflight checks that should be skipped during the MachineSet reconciliation.
// Supported items are:
Expand Down
4 changes: 4 additions & 0 deletions api/v1beta1/condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,10 @@ const (
// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
MachineHasFailureReason = "MachineHasFailure"

// HasRemediateMachineAnnotationReason is the reason that gets set on the MachineHealthCheckSucceededCondition when a machine
// has the RemediateMachineAnnotation set.
HasRemediateMachineAnnotationReason = "HasRemediateMachineAnnotation"

// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
NodeStartupTimeoutReason = "NodeStartupTimeout"

Expand Down
1 change: 1 addition & 0 deletions docs/book/src/reference/labels_and_annotations.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
| cluster.x-k8s.io/cloned-from-name | It is the infrastructure machine annotation that stores the name of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. |
| cluster.x-k8s.io/cloned-from-groupkind | It is the infrastructure machine annotation that stores the group-kind of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. |
| cluster.x-k8s.io/skip-remediation | It is used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler. |
| cluster.x-k8s.io/remediate-machine | It can be applied to a machine to manually mark it for remediation by MachineHealthCheck reconciler. |
| cluster.x-k8s.io/managed-by | It can be applied to InfraCluster resources to signify that some external system is managing the cluster infrastructure. Provider InfraCluster controllers will ignore resources with this annotation. An external controller must fulfill the contract of the InfraCluster resource. External infrastructure providers should ensure that the annotation, once set, cannot be removed. |
| cluster.x-k8s.io/replicas-managed-by | It can be applied to MachinePool resources to signify that some external system is managing infrastructure scaling for that pool. See [the MachinePool documentation](../developer/architecture/controllers/machine-pool.md#externally-managed-autoscaler) for more details. |
| cluster.x-k8s.io/skip-machineset-preflight-checks | It can be applied on MachineDeployment and MachineSet resources to specify a comma-separated list of preflight checks that should be skipped during MachineSet reconciliation. Supported preflight checks are: All, KubeadmVersionSkew, KubernetesVersionSkew, ControlPlaneIsStable. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ func (t *healthCheckTarget) nodeName() string {

// Determine whether or not a given target needs remediation.
// The node will need remediation if any of the following are true:
// - The Machine has the remediate machine annotation
// - The Machine has failed for some reason
// - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses
// - The Node has gone away
Expand All @@ -93,6 +94,12 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
var nextCheckTimes []time.Duration
now := time.Now()

if annotations.HasRemediateMachine(t.Machine) {
conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.HasRemediateMachineAnnotationReason, clusterv1.ConditionSeverityWarning, "Marked for remediation via remediate-machine annotation")
logger.V(3).Info("Target is marked for remediation via remediate-machine annotation")
return true, time.Duration(0)
}

if t.Machine.Status.FailureReason != nil {
conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", *t.Machine.Status.FailureReason)
logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,10 @@ func TestGetTargetsFromMHC(t *testing.T) {
// machines for skip remediation
testNode5 := newTestNode("node5")
testMachine5 := newTestMachine("machine5", namespace, clusterName, testNode5.Name, mhcSelector)
testMachine5.Annotations = map[string]string{"cluster.x-k8s.io/skip-remediation": ""}
testMachine5.Annotations = map[string]string{clusterv1.MachineSkipRemediationAnnotation: ""}
testNode6 := newTestNode("node6")
testMachine6 := newTestMachine("machine6", namespace, clusterName, testNode6.Name, mhcSelector)
testMachine6.Annotations = map[string]string{"cluster.x-k8s.io/paused": ""}
testMachine6.Annotations = map[string]string{clusterv1.PausedAnnotation: ""}

testCases := []struct {
desc string
Expand Down Expand Up @@ -340,6 +340,18 @@ func TestHealthCheckTargets(t *testing.T) {
}
machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg)

// Target for when the machine has the remediate machine annotation
annotationRemediationMsg := "Marked for remediation via remediate-machine annotation"
testMachineAnnotationRemediation := testMachine.DeepCopy()
testMachineAnnotationRemediation.Annotations = map[string]string{clusterv1.RemediateMachineAnnotation: ""}
machineAnnotationRemediation := healthCheckTarget{
Cluster: cluster,
MHC: testMHC,
Machine: testMachineAnnotationRemediation,
Node: nil,
}
machineAnnotationRemediationCondition := newFailedHealthCheckCondition(clusterv1.HasRemediateMachineAnnotationReason, annotationRemediationMsg)

testCases := []struct {
desc string
targets []healthCheckTarget
Expand Down Expand Up @@ -426,6 +438,14 @@ func TestHealthCheckTargets(t *testing.T) {
expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition},
expectedNextCheckTimes: []time.Duration{},
},
{
desc: "when the machine is manually marked for remediation",
targets: []healthCheckTarget{machineAnnotationRemediation},
expectedHealthy: []healthCheckTarget{},
expectedNeedsRemediation: []healthCheckTarget{machineAnnotationRemediation},
expectedNeedsRemediationCondition: []clusterv1.Condition{machineAnnotationRemediationCondition},
expectedNextCheckTimes: []time.Duration{},
},
}

for _, tc := range testCases {
Expand Down
5 changes: 5 additions & 0 deletions util/annotations/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ func HasSkipRemediation(o metav1.Object) bool {
return hasAnnotation(o, clusterv1.MachineSkipRemediationAnnotation)
}

// HasRemediateMachine reports whether the given object carries the
// `remediate-machine` annotation (clusterv1.RemediateMachineAnnotation).
// The lookup itself is delegated to hasAnnotation.
func HasRemediateMachine(o metav1.Object) bool {
	marked := hasAnnotation(o, clusterv1.RemediateMachineAnnotation)
	return marked
}

// HasWithPrefix returns true if at least one of the annotations has the prefix specified.
func HasWithPrefix(prefix string, annotations map[string]string) bool {
for key := range annotations {
Expand Down
Loading